Full Code of bkelly-lab/ReplicationCrisis for AI

master 2434da1281aa cached
29 files
430.1 KB
142.7k tokens
1 requests
Download .txt
Showing preview only (445K chars total). Download the full file or copy to clipboard to get everything.
Repository: bkelly-lab/ReplicationCrisis
Branch: master
Commit: 2434da1281aa
Files: 29
Total size: 430.1 KB

Directory structure:
gitextract_789kjfg8/

├── .gitignore
├── Analysis/
│   ├── .gitignore
│   ├── 0 - Functions.R
│   ├── 1 - Prepare Data.R
│   ├── 2 - Determine Clusters.R
│   ├── 3 - Analysis.R
│   ├── 4 - Output.R
│   ├── Analysis.Rproj
│   ├── Country Classification.xlsx
│   ├── Factor Details.xlsx
│   ├── README.md
│   ├── country_stats.R
│   ├── hxz_decomp.R
│   └── main.R
├── GlobalFactors/
│   ├── CHANGELOG.md
│   ├── Cluster Labels.csv
│   ├── Country Classification.xlsx
│   ├── Factor Details.xlsx
│   ├── GlobalFactors.Rproj
│   ├── MD
│   ├── README.md
│   ├── accounting_chars.sas
│   ├── char_macros.sas
│   ├── ind_identification.sas
│   ├── main.sas
│   ├── market_chars.sas
│   ├── portfolios.R
│   └── project_macros.sas
└── README.md

================================================
FILE CONTENTS
================================================

================================================
FILE: .gitignore
================================================
GlobalFactors/.Rproj.user
GlobalFactors/.Rhistory
GlobalFactors/.RData
GlobalFactors/.Ruserdata
Analysis/Data/
Analysis/Objects/
Analysis/Figures/
Analysis/.Rproj.user
Analysis/.Rhistory
Analysis/.RData
Analysis/.Ruserdata
.Rproj.user


================================================
FILE: Analysis/.gitignore
================================================
.Rhistory
.RData
.Rproj.user
Data/
Objects/
Figures/
Scribbles/


================================================
FILE: Analysis/0 - Functions.R
================================================
# Prepare factor returns for the empirical-Bayes estimation.
#
# data: long tibble with columns region, characteristic, eom, ret, mkt_vw_exc.
# scale_alphas: if TRUE, the wide output holds returns scaled so the
#   market-neutral series targets 10% annualised volatility (in percent);
#   otherwise the raw market-neutralised returns.
# overlapping: if TRUE, keep only region/characteristic series with the
#   maximal number of observations, so every retained series fully overlaps.
# Returns list(long = augmented long data, wide = one column per
#   "characteristic__region" factor plus eom).
eb_prepare <- function(data, scale_alphas, overlapping) {
  if (overlapping) {
    # Drop any series with fewer observations than the most complete one
    data <- data %>%
      group_by(region, characteristic) %>%
      mutate(obs = n()) %>%
      ungroup() %>%
      filter(obs == max(obs)) %>%
      select(-obs)
  }
  # Market-neutralise each series (full-sample beta) and compute the
  # volatility-scaled version; returns are expressed in percent (* 100)
  data <- data %>%
    group_by(region, characteristic) %>%
    mutate(
      beta = cov(ret, mkt_vw_exc) / var(mkt_vw_exc),
      ret_neu = (ret - mkt_vw_exc * beta) * 100,
      scaling_fct = sqrt(10^2 / 12) / sd(ret_neu),
      ret_neu_scaled = ret_neu * scaling_fct
    ) %>%
    ungroup() %>%
    mutate(name_wide = str_c(characteristic, "__", region))
  # Wide layout: one column per characteristic__region pair
  wide_tbl <- if (scale_alphas) {
    data %>%
      select(name_wide, eom, ret_neu_scaled) %>%
      spread(key = name_wide, value = ret_neu_scaled)
  } else {
    data %>%
      select(name_wide, eom, ret_neu) %>%
      spread(key = name_wide, value = ret_neu)
  }
  list(
    "long" = data,
    "wide" = wide_tbl
  )
}

# Replace a factor correlation matrix with its cluster-block average:
# every entry becomes the mean correlation of its (cluster, region) block
# pair, keeping 1 on the exact diagonal (same factor, same region).
#
# cor_mat: correlation matrix with "characteristic__region" dimnames.
# cl_lables: tibble mapping characteristic -> hcl_label. (Parameter name
#   kept as-is for the existing callers.)
# Returns the block-averaged correlation matrix with the same dimnames.
block_cluster_func <- function(cor_mat, cl_lables) {
  # Long format with cluster labels attached to both sides of each pair
  long <- cor_mat %>%
    as_tibble(rownames = "char1") %>%
    gather(-char1, key = "char2", value = "cor") %>%
    separate(col = "char1", into = c("char1", "region1"), sep = "__") %>%
    separate(col = "char2", into = c("char2", "region2"), sep = "__") %>%
    left_join(cl_lables %>% select(characteristic, "hcl1" = hcl_label), by = c("char1" = "characteristic")) %>%
    left_join(cl_lables %>% select(characteristic, "hcl2" = hcl_label), by = c("char2" = "characteristic")) %>%
    mutate(
      hclreg1 = str_c(hcl1, "__", region1),
      hclreg2 = str_c(hcl2, "__", region2),
      # Order-invariant block-pair key (pmin/pmax = elementwise min/max)
      hcl_pair = str_c(pmin(hclreg1, hclreg2), "_x_", pmax(hclreg1, hclreg2))
    ) %>%
    select(-hcl1, -hcl2) %>%
    unite(col = "name1", char1, region1, sep = "__") %>%
    unite(col = "name2", char2, region2, sep = "__")

  # Average correlation within each block pair, excluding the diagonal
  pair_avg <- long %>%
    filter(name1 != name2) %>%  # Exclude cor(factor_i, factor_i)=1
    group_by(hcl_pair) %>%
    summarise(
      cor_avg = mean(cor)
    )

  # Rebuild the matrix with block averages; the exact diagonal stays 1
  block_wide <- long %>%
    left_join(pair_avg, by = "hcl_pair") %>%
    mutate(cor_avg = if_else(name1 == name2, 1, cor_avg)) %>%  # ONLY IF SAME REGION!!
    select(name1, name2, cor_avg) %>%
    spread(key = name2, value = cor_avg)

  out <- block_wide %>% select(-name1) %>% as.matrix()
  rownames(out) <- block_wide$name1
  out
}

# Empirical Bayes ----------------
# Hierarchical empirical-Bayes model of factor alphas (factor / signal /
# theme-cluster layers), estimated by marginal maximum likelihood.
#
# data:           list from eb_prepare(); $wide holds eom + one column per
#                 "characteristic__region" factor.
# cluster_labels: tibble mapping characteristic -> hcl_label.
# min_obs:        minimum monthly observations for a factor to be included.
# fix_alpha:      fix the grand-mean alpha at zero in the MLE.
# bs_cov:         if TRUE, bootstrap the covariance of the alpha estimates
#                 instead of using analytical sample moments.
# cor_type:       "sample" keeps the (shrunk) sample correlation matrix;
#                 "block_clusters" replaces it with cluster-block averages.
# shrinkage:      weight on the identity when shrinking the correlation matrix.
# layers:         1, 2 or 3 prior layers (see omega_func below).
# priors:         optional list(alpha, tau_c, tau_s, tau_w) to skip the MLE.
# sigma:          optional pre-computed sampling covariance (sigma/T scale).
# plot:           if TRUE, draw the standard EB plots as a side effect.
# Returns a list with posterior factor (and signal) summaries, posterior
# mean/covariance, prior covariance theta, and sigma.
emp_bayes <- function(data, cluster_labels, min_obs = 5 * 12, fix_alpha = F, bs_cov = F, cor_type = "sample", shrinkage = 0, layers = 3, bs_samples = 10000, seed, priors = NULL, sigma = NULL, plot = T) { # cor_type %in% c("sample", "block_clusters")
  set.seed(seed)
  # sigma is (re)assigned below when it is estimated here, so record up front
  # whether the caller supplied it. The original tested is.null(sigma) again at
  # the end, which was always FALSE by then, so alpha_cor_raw/alpha_cor_adj
  # were never returned.
  sigma_supplied <- !is.null(sigma)
  y_raw <- data$wide %>% select(-eom) %>% as.matrix()
  obs <- y_raw %>% apply(2, function(x) sum(!is.na(x)))
  y <- y_raw[, obs >= min_obs]
  n_fcts <- ncol(y)
  y_mean <- y %>% apply(2, mean, na.rm = T)  
  
  if (!sigma_supplied) {
    if (bs_cov) {
      # Bootstrap distribution of the per-factor mean returns
      bs_full <- y %>%
        rsample::bootstraps(times = bs_samples) %>%
        mutate(
          res = splits %>% map(~.x %>% rsample::analysis() %>% apply(2, mean, na.rm = T) %>% as_tibble(rownames = "characteristic"))
        ) %>%
        select(-splits) %>%
        unnest(res)
      
      bs_full_cov <- bs_full %>%
        spread(key = characteristic, value = value) %>%
        select(-id) %>%
        cov()
      
      alpha_sd <- sqrt(diag(bs_full_cov)) 
      alpha_cor <- solve(diag(alpha_sd)) %*% bs_full_cov %*% solve(diag(alpha_sd))
      colnames(alpha_cor) <- names(alpha_sd)
      rownames(alpha_cor) <- names(alpha_sd)
      
    } else {
      # Analytical standard error of the mean + pairwise-complete correlation
      y_sd <- y %>% apply(2, sd, na.rm=T) 
      y_scor <- y %>% cor(use = "complete.obs")  
      alpha_sd <- y_sd / sqrt(nrow(y))
      alpha_cor <- y_scor
    }
    
    # Apply Shrinkage
    alpha_cor_shrunk <- alpha_cor * (1-shrinkage) + diag(n_fcts) * shrinkage
    
    # Correlation Block Adjustment (fail fast on an unsupported cor_type;
    # previously this surfaced later as "object 'alpha_cor_adj' not found")
    if (!cor_type %in% c("sample", "block_clusters")) {
      stop("Unknown cor_type: ", cor_type)
    }
    if (cor_type == "sample") {
      alpha_cor_adj <- alpha_cor_shrunk
    }
    if (cor_type == "block_clusters") {
      alpha_cor_adj <- alpha_cor_shrunk %>% block_cluster_func(cl_lables = cluster_labels)
    }
    
    sigma <- diag(alpha_sd) %*% alpha_cor_adj %*% diag(alpha_sd)  # This is really the equivalent of sigma/T from the paper
    colnames(sigma) <- colnames(alpha_cor_shrunk)
    
    print(str_c("Condition Number: Raw = ", round(kappa(alpha_cor_shrunk), 2), ", Adjusted = ", round(kappa(alpha_cor_adj), 2)))
  } else {
    alpha_sd <- sqrt(diag(sigma))
    names(alpha_sd) <- colnames(sigma)
  }
  
  # Cluster Membership: m is (factor x cluster) indicator matrix
  cm <- y_mean %>% 
    as_tibble(rownames = "char_reg") %>%
    mutate(
      characteristic = str_split(char_reg, "__", simplify = T)[, 1]
    ) %>%
    left_join(cluster_labels, by = "characteristic") 
  
  m <- cm %>%
    mutate(cm = 1) %>%
    select(char_reg, hcl_label, cm) %>%
    spread(key = hcl_label, value = cm) %>% 
    select(-char_reg) %>% 
    as.matrix()
  m[is.na(m)] <- 0
  mm <- m %*% t(m)
  
  n_cl <- ncol(m)
  
  # Signal Membership: z is (factor x characteristic) indicator matrix
  z <- cm %>%
    mutate(sm = 1) %>%
    select(char_reg, characteristic, sm) %>%
    spread(key = characteristic, value = sm) %>%
    select(-char_reg) %>%
    as.matrix()
  
  z[is.na(z)] <- 0
  zz <- z %*% t(z)
  
  n_s <- ncol(z)
  
  # Crude method-of-moments starting values for the MLE
  starting_values <- cm %>%
    group_by(hcl_label, characteristic) %>%
    summarise(
      n_s = n(),
      signal_mean = mean(value),
      signal_sd = sd(value)
    ) %>%
    group_by(hcl_label) %>%
    summarise(
      n_c = sum(n_s),
      cl_mean = mean(signal_mean),
      cl_sd = sd(signal_mean),
      cl_signal_within = mean(signal_sd)
    ) %>%
    ungroup() %>%
    mutate(cl_sd = if_else(n_c == 1, 0, cl_sd)) %>%
    summarise(
      alpha_mean = mean(cl_mean),  
      sd_cl_mean = if_else(condition = fix_alpha, sqrt(sum((cl_mean^2) / (n() - 1))), sd(cl_mean)),
      sd_within_cl = mean(cl_sd),
      sd_within_signal = mean(cl_signal_within)
    )
  
  if (fix_alpha) {
    # Dispersion around zero rather than around the sample mean
    sd_all <- sqrt(sum(y_mean^2) / (length(y_mean) - 1))
  } else {
    sd_all <- sd(y_mean)
  }
  
  # Prior covariance of the alphas for the chosen number of layers
  omega_func <- function(layers, tau_c, tau_s, tau_w) {
    if (layers == 1) {
      a_omega <- diag(n_fcts) * tau_c^2                                        # All alphas are drawn from same distribution
    }
    if (layers == 2) {
      a_omega <- diag(n_fcts) * tau_s^2 + mm * tau_c^2                       # All cluster alphas are drawn from the same distribution, could be done with signals as well  
    }
    if (layers == 3) {
      a_omega <- diag(n_fcts) * tau_w^2 + zz * tau_s^2 + mm * tau_c^2      # Cluster distrib., signal distrib. factor distrib.
    }
    return(a_omega)
  }
  
  # Choose between specifying prior parameters or finding them via EB
  if (is.null(priors)) {
    if (layers == 1) {
      start_list <- list(
        a = starting_values$alpha_mean, 
        tc = sd_all)
      
      mle_func <- function(a, tc) {
        a_vec <- rep(a, n_fcts)
        a_omega <- omega_func(layers = layers, tau_c = tc, tau_s = NULL, tau_w = NULL)
        
        a_cov <- sigma + a_omega  #  / t_mat
        
        -(mvtnorm::dmvnorm(x = y_mean, mean = a_vec, sigma = a_cov, log = T))  #  + dgamma(param[2], 2, 5, log = T)*sum(mm)/2  + dgamma(param[3], 2, 10, log = T)*140 
      }
    } 
    if (layers == 2) {
      start_list <- list(
        a = starting_values$alpha_mean, 
        tc = starting_values$sd_cl_mean, 
        ts = starting_values$sd_within_cl)
      
      mle_func <- function(a, tc, ts) {
        a_vec <- rep(a, n_fcts)
        a_omega <- omega_func(layers = layers, tau_c = tc, tau_s = ts, tau_w = NULL)
        
        a_cov <- sigma + a_omega  #  / t_mat
        
        -(mvtnorm::dmvnorm(x = y_mean, mean = a_vec, sigma = a_cov, log = T))  
      }
    }
    if (layers == 3) {
      start_list <- list(
        a = starting_values$alpha_mean, 
        tc = starting_values$sd_cl_mean, 
        ts = starting_values$sd_within_cl,
        tw = starting_values$sd_within_signal)
      
      mle_func <- function(a, tc, ts, tw) {
        a_vec <- rep(a, n_fcts)
        a_omega <- omega_func(layers = layers, tau_c = tc, tau_s = ts, tau_w = tw)
        
        a_cov <- sigma + a_omega  
        
        -(mvtnorm::dmvnorm(x = y_mean, mean = a_vec, sigma = a_cov, log = T))
      }
    }
    
    # Maximum likelihood estimation; retry up to 10 times with jittered
    # starting values if the optimiser fails to converge
    for (k in 1:10) {
      initial_params <- start_list %>% lapply(function(x) max(x+rnorm(1, mean = 0, sd = 0.01), 0)) # Max is just to ensure that variances are not negative, never in use
      if (fix_alpha) {
        (hyper_pars <- stats4::mle(minuslogl = mle_func, start = initial_params, lower = c(-Inf, 0, 0, 0)[1:length(start_list)], fixed = list(a = 0)))
      } else {
        (hyper_pars <- stats4::mle(minuslogl = mle_func, start = initial_params, lower = c(-Inf, 0, 0, 0)[1:length(start_list)]))
      }
      if (hyper_pars@details$convergence==0) break
    }
    # Check convergence
    if (hyper_pars@details$convergence != 0) {
      warning("MLE step did not converge!!!")
      return(NULL)
    } 
    
    mu <- hyper_pars@fullcoef["a"]
    tau_c <- hyper_pars@fullcoef["tc"]
    tau_s <- hyper_pars@fullcoef["ts"]
    tau_w <- hyper_pars@fullcoef["tw"]
  } else {
    mu <- priors$alpha
    tau_c <- priors$tau_c
    tau_s <- priors$tau_s
    tau_w <- priors$tau_w
  }
  
  theta <- omega_func(layers = layers, tau_c = tau_c, tau_s = tau_s, tau_w = tau_w)
  colnames(theta) <- rownames(theta) <- names(y_mean)
  print(paste("Condition Number Omega =", round(kappa(theta))))
  
  # Signal Posteriors ------------------
  if (layers == 3) {
    # NOTE(review): these formulas use (theta + sigma) directly; the standard
    # conditional-mean expression would use solve(theta + sigma). Confirm
    # against the paper before relying on the signal-level posteriors.
    as_mean <- tau_w^2*t(z) %*% (theta + sigma) %*% (y_mean - rep(mu, n_fcts))
    as_cov <- tau_w^2 * diag(n_s) - tau_w^4 * t(z) %*% (theta + sigma) %*% z
    as_sd <- sqrt(diag(as_cov))
    colnames(as_mean) <- "post_mean"
    
    signal_summary <- as_mean %>% as_tibble(rownames = "characteristic") %>%
      left_join(as_sd %>% as_tibble(rownames = "characteristic") %>% rename("post_sd" = value), by = "characteristic")
  }
  
  # Factor Posteriors: standard normal-normal precision-weighted update
  ai_cov <- solve(solve(theta) + solve(sigma))  # t_mat * solve(sigma)
  ai_sd <- sqrt(diag(ai_cov))
  ai_mean <- ai_cov %*% (solve(theta) %*% rep(mu, n_fcts) + solve(sigma) %*% y_mean) ## (t_mat * solve(sigma))
  
  rownames(ai_mean) <- names(y_mean)
  colnames(ai_mean) <- "post_mean"
  names(ai_sd) <- names(y_mean)
  
  factor_summary <- ai_mean %>% as_tibble(rownames = "char_reg") %>%
    left_join(ai_sd %>% as_tibble(rownames = "char_reg") %>% rename("post_sd" = value), by = "char_reg") %>%
    left_join(y_mean %>% as_tibble(rownames = "char_reg") %>% rename("ols_est" = value), by = "char_reg") %>%
    left_join(alpha_sd %>% as_tibble(rownames = "char_reg") %>% rename("ols_se" = value), by = "char_reg") %>%
    mutate(
      characteristic = str_split(char_reg, "__", simplify = T)[, 1],
      p025 = post_mean - 1.96 * post_sd,
      p975 = post_mean + 1.96 * post_sd
    ) %>%
    left_join(cluster_labels, by = "characteristic") %>%
    mutate(
      region = char_reg %>% str_extract(pattern = "(?<=_{2}).+")
    ) %>%
    select(char_reg, characteristic, hcl_label, region, everything()) 
  
  # Output: compare crude starting values with the ML estimates
  if (is.null(priors)) {
    comparison <- tibble(
      estimate = c("alpha", "tau_c", "tau_s", "tau_w")[1:(layers + 1)],
      crude = drop(unlist(start_list)),
      ml_est = c(mu, tau_c, tau_s, tau_w)[1:(layers + 1)]
    )
    if (fix_alpha) {
      # alpha was held fixed, so it has no standard error
      ml_se <- c(NA_real_, sqrt(diag(solve(hyper_pars@details$hessian))))
    } else {
      ml_se <- sqrt(diag(solve(hyper_pars@details$hessian)))
    }
    comparison$ml_se <- ml_se
    
    print(comparison)
  }
  
  if (plot == T) {
    list("factors" = factor_summary) %>% eb_plots()
  }
  
  ret_list <- list(
    "input" = data,
    "factors" = factor_summary,
    "factor_mean" = ai_mean,
    "factor_cov" = ai_cov,
    "theta" = theta,
    "sigma" = sigma
  )
  # Fix: check the recorded flag, not is.null(sigma) -- sigma has been
  # assigned above whenever it was estimated here, so the original condition
  # was always FALSE and these elements were silently dropped.
  if (!sigma_supplied) {
    ret_list[["alpha_cor_raw"]] <- alpha_cor_shrunk
    ret_list[["alpha_cor_adj"]] <- alpha_cor_adj
  }
  if (is.null(priors)) {
    ret_list[["mle"]] <- comparison
  }
  if (layers == 3) {
    ret_list$signal <- signal_summary
  }
  return(ret_list)
}

# Simulated false-discovery and family-wise error rates as a function of the
# t-statistic cutoff, based on draws from the posterior of the alphas.
#
# t_low:  lowest cutoff considered; steps run over observed t-values above it.
# a_vec:  posterior mean alphas (named vector).
# a_cov:  posterior covariance of the alphas.
# Returns a tibble with one row per cutoff: t_cutoff, n_sig, fdr, fwr.
fdr_sim <- function(t_low, a_vec, a_cov, n_sim = 10000, seed=1) {
  set.seed(seed)
  t_all <- a_vec / sqrt(diag(a_cov))
  t_steps <- sort(t_all[t_all > t_low])
  t_steps <- head(t_steps, -1)  # Don't include the last t-value (no significant)
  # Simulated alphas
  sims <- mvtnorm::rmvnorm(n = n_sim, mean = a_vec, sigma = a_cov)
  # False Discovery as a Function of t-cutoff
  t_steps %>% lapply(function(t) {
    # Significant alphas under t-cutoff
    sig <- (t_all >= t)
    # drop = FALSE keeps a one-column matrix when a single factor is
    # significant (rowMeans would otherwise fail on a bare vector), and
    # rep(..., each = nrow(sims)) aligns each point-estimate sign with its
    # own column instead of relying on column-major recycling.
    sims_fdr <- rowMeans(sign(sims[, sig, drop = FALSE]) != rep(sign(a_vec[sig]), each = nrow(sims)))
    # Output
    tibble(t_cutoff = t, n_sig = sum(sig), fdr = mean(sims_fdr), fwr = mean(sims_fdr > 0))
  }) %>% bind_rows()
}

# Posterior FDR / FWER at a single t-statistic cutoff.
#
# a_vec:    posterior mean alphas (matrix with "characteristic__region" rownames).
# a_cov:    posterior covariance of the alphas.
# orig_sig: if TRUE, restrict to factors significant in the original studies
#           (requires the global `char_info` and world-level factor names).
# Prints summary stats and returns list(fdr_dist, fwer_dist, fwer_fdr).
fdr_fwer_rates <- function(t_cutoff, a_vec, a_cov, orig_sig = F, n_sim = 10000, seed=1) {
  set.seed(seed)
  # Simulate from full posterior
  sims <- mvtnorm::rmvnorm(n = n_sim, mean = a_vec, sigma = a_cov)
  if (orig_sig == T) {
    orig_factors <- char_info %>% filter(significance == T) %>% pull(characteristic) %>% str_c("__world")
    # drop = FALSE guards the single-factor case
    sims <- sims[, match(x = orig_factors, table = rownames(a_vec)), drop = FALSE]
    a_vec <- a_vec[orig_factors, ]
    a_cov <- a_cov[orig_factors, orig_factors]
  } 
  t_all <- a_vec / sqrt(diag(a_cov))
  sig <- (t_all >= t_cutoff) 
  # drop = FALSE keeps the matrix shape when exactly one factor is significant
  sig_sims <- sims[, sig, drop = FALSE]
  # Per posterior draw: share of significant factors whose simulated alpha is
  # negative (a "false discovery", given the declared factors are positive)
  false_discoveries <- sig_sims %>% apply(1, function(x) mean(x<0))
  # FDR Distribution across posterior draws
  fdr_dist <- tibble(
    min = min(false_discoveries),
    p025 = quantile(false_discoveries, 0.025),
    p50 = quantile(false_discoveries, 0.5),
    p975 = quantile(false_discoveries, 0.975),
    max = max(false_discoveries),
    mean = mean(false_discoveries),
    sd = sd(false_discoveries)
  )
  # FWER Distribution (indicator of >= 1 false discovery per draw)
  fwer_dist <- tibble(
    min = min(false_discoveries!=0),
    p025 = quantile(false_discoveries!=0, 0.025),
    p50 = quantile(false_discoveries!=0, 0.5),
    p975 = quantile(false_discoveries!=0, 0.975),
    max = max(false_discoveries!=0),
    mean = mean(false_discoveries!=0),
    sd = sd(false_discoveries!=0)
  )
  # Output 
  print(paste("Factors:", length(t_all), "- Sig:", sum(sig)))
  print(paste("Mean p-value:", round(mean(1-pnorm(t_all[sig])), 6)))
  fwer_fdr <- tibble(t_cutoff = t_cutoff, n_sig = sum(sig), fdr = mean(false_discoveries), fwer = mean(false_discoveries > 0))
  list("fdr_dist"=fdr_dist, "fwer_dist" = fwer_dist, "fwer_fdr"=fwer_fdr)
}

# True Factors 
# Posterior distribution of the share of "true" factors, i.e. the fraction of
# simulated posterior t-statistics exceeding t_cutoff, with a bias-corrected
# percentile interval. orig_sig = TRUE restricts the universe to factors
# significant in the original studies (requires the global `char_info`).
true_factors <- function(t_cutoff, a_vec, a_cov, orig_sig = T, n_sim = 10000, seed=1) {
  set.seed(seed)
  post_vol <- sqrt(diag(a_cov))
  # Draws from the joint posterior of all alphas
  sims <- mvtnorm::rmvnorm(n = n_sim, mean = a_vec, sigma = a_cov)
  # Optionally restrict to the originally-significant (world) factors
  if (orig_sig == T) {
    orig_factors <- char_info %>% filter(significance == T) %>% pull(characteristic) %>% str_c("__world")
    keep <- match(x = orig_factors, table = rownames(a_vec))
    sims <- sims[, keep]
    post_vol <- post_vol[keep]
    a_vec <- a_vec[keep]
  }
  # Convert simulated alphas to t-statistics (divide each column by its vol)
  sims <- sweep(sims, 2, post_vol, "/")
  true_factors_dist <- apply(sims, 1, function(x) mean(x > t_cutoff))
  true_stat <- mean(a_vec / post_vol > t_cutoff)
  # Bias-corrected percentile interval; see point 4 in
  # https://influentialpoints.com/Training/bootstrap_confidence_intervals-principles-properties-assumptions.htm
  bc_ci <- function(stat, bootstraps, alpha=0.05) { # bias corrected bootstrap standard errors
    # Bias in standard-normal deviates: proportion of bootstrap draws above
    # the point estimate (half of any ties counted above); 0.5 = unbiased
    b <- qnorm((sum(bootstraps > stat)+sum(bootstraps==stat)/2)/length(bootstraps))
    z <- qnorm(c(alpha/2,1-alpha/2)) # Std. norm. limits
    p <- pnorm(z-2*b) # bias-correct & convert to proportions
    quantile(bootstraps,p=p) # Bias-corrected percentile lims.
  }
  bc <- bc_ci(stat = true_stat, bootstraps = true_factors_dist, alpha = 0.05)
  tibble(
    min = min(true_factors_dist),
    p025 = quantile(true_factors_dist, 0.025),
    p50 = quantile(true_factors_dist, 0.5),
    p975 = quantile(true_factors_dist, 0.975),
    max = max(true_factors_dist),
    mean = mean(true_factors_dist),
    sd = sd(true_factors_dist),
    p025_bc = bc[1],
    p975_bc = bc[2]
  )
}

# Simulations for figure 2
# Simulation for Figure 2: compare discovery/FDR statistics of empirical
# Bayes ("eb"), plain OLS ("ols"), and Benjamini-Yekutieli-adjusted OLS
# ("by") over a grid of cluster (tau_c) and idiosyncratic (tau_w) priors.
#
# sim_settings: list with n (factors), clusters, fct_pr_cl, corr_within,
#   corr_across, se, alpha_0, tau_c / tau_w grids, and n_sims.
# Returns a tibble with per-(type, tau_c, tau_w) averages of fdr, n_disc,
# false_disc, true_disc, and true_disc_rate.
sim_mt_control <- function(sim_settings) {
  # Cluster membership
  m <- matrix(0, nrow = sim_settings$n, ncol = sim_settings$clusters)  # Cluster membership
  j <- 0
  for (i in 1:sim_settings$clusters) {
    m[(j+1):(j + sim_settings$fct_pr_cl), i] <- 1
    j <- j + sim_settings$fct_pr_cl
  }
  # Correlation Matrix: corr_within inside a cluster, corr_across otherwise,
  # diagonal reset to 1 last (the == 1 assignment also hits the diagonal)
  corr_mat <- m %*% t(m)
  corr_mat[corr_mat == 0] <- sim_settings$corr_across
  corr_mat[corr_mat == 1] <- sim_settings$corr_within
  diag(corr_mat) <- 1
  # Sigma: sampling covariance of the estimated alphas
  sigma <- sim_settings$se^2 * corr_mat
  # Predefine variables
  alpha_0_vec <- rep(sim_settings$alpha_0, sim_settings$n)
  i_n <- diag(sim_settings$n)
  # Simulation over the (tau_c, tau_w) grid
  search_grid <- expand.grid("tau_c" = sim_settings$tau_c, "tau_w" = sim_settings$tau_w)
  1:nrow(search_grid) %>% lapply(function(i) {
    tau_c <- search_grid[i, "tau_c"]
    tau_w <- search_grid[i, "tau_w"]
    print(paste("Iteration", i, "out of", nrow(search_grid)))
    alpha_noise <- MASS::mvrnorm(n = sim_settings$n_sims, mu = rep(0, sim_settings$n), Sigma = sigma) # Preallocate alpha noise for efficiency
    # Note: `s` inside the lambda is the simulation index and shadows the
    # outer result list also named `s`
    s <- 1:sim_settings$n_sims %>% lapply(function(s) {
      # NOTE(review): omega is invariant within this loop and could be hoisted
      omega <- m %*% t(m) * tau_c^2 + i_n * tau_w^2
      # Draw true alphas: cluster component + idiosyncratic component
      alpha_c <- rnorm(sim_settings$clusters) * tau_c
      alpha_w <- rnorm(sim_settings$n) * tau_w
      alpha_true <- alpha_0_vec + m %*% alpha_c + alpha_w
      alpha_hat <- alpha_true + alpha_noise[s, ]
      
      # Normal-normal posterior (precision-weighted combination)
      post_var <- solve(solve(omega) + solve(sigma))
      post_alpha <- post_var %*% (solve(omega) %*% alpha_0_vec + solve(sigma) %*% alpha_hat)
      
      # `p` can reference `z` here because tibble() evaluates sequentially
      eb <- tibble("type" = "eb", "true_alpha" = drop(alpha_true), "z" = drop(post_alpha / sqrt(diag(post_var))), "p" = 2 * pnorm(abs(z), lower.tail = F))
      ols <- tibble("type" = "ols", "true_alpha" = drop(alpha_true), "z" = drop(alpha_hat / sqrt(diag(sigma))), "p" = 2 * pnorm(abs(z), lower.tail = F))
      by <- tibble("type" = "by", "true_alpha" = drop(alpha_true), "z" = ols$z)
      by$p <- p.adjust(ols$p, method = "BY")
      # A discovery = positive z with one-sided p below 2.5%
      rbind(eb, ols, by) %>%
        mutate(sig = z > 0 & p < 0.025) %>%
        group_by(type) %>%
        summarise(
          sim = s,
          n_disc = sum(sig),
          true_disc = sum(sign(true_alpha[sig == T]) == sign(z[sig == T])),
          false_disc = n_disc - true_disc
        )
    }) %>% bind_rows()
    # Average over simulations; fdp = 0 when nothing was discovered
    s %>% 
      group_by(type) %>%
      mutate(fdp = if_else(n_disc == 0, 0, false_disc / n_disc)) %>%
      summarise(
        fdr = mean(fdp),
        n_disc = mean(n_disc),
        false_disc = mean(false_disc),
        true_disc = mean(true_disc),
        tau_c = tau_c,
        tau_w = tau_w,
        n = n()
      ) %>%
      # NOTE(review): denominator assumes half of the factors are "true" --
      # confirm this matches the simulation design used for the figure
      mutate(true_disc_rate = true_disc / (sim_settings$n / 2))
  }) %>% bind_rows()
}

# OLS t-statistics and p-values per factor together with the classical
# multiple-testing adjustments (Bonferroni, Holm, BH, BY), applied within
# each region and stacked long by adjustment method.
multiple_testing <- function(eb_all, eb_world = NULL) {
  adjusted <- bind_rows(eb_all$factors, eb_world$factors) %>%
    mutate(
      t_ols = ols_est/ols_se,
      p_ols = 2*pnorm(abs(t_ols), lower.tail = F)
    ) %>%
    group_by(region) %>%
    mutate(
      n = n(),
      p_bonf = p.adjust(p_ols, method = "bonferroni"),
      p_holm = p.adjust(p_ols, method = "holm"),
      p_bh = p.adjust(p_ols, method = "BH"),
      p_by = p.adjust(p_ols, method = "BY")
    ) %>%
    select(n, region, char_reg, "estimate" = ols_est, "statistic" = t_ols, "se" = ols_se, starts_with("p_"))

  # One row per factor x method, with human-readable labels
  adjusted %>%
    gather(starts_with("p_"), key = "method", value = "p") %>%
    mutate(
      method = str_remove(method, "^p_"),
      mt_adj = case_when(
        method == "ols" ~ "None",
        method %in% c("bh", "by") ~ "FDR",
        method %in% c("bonf", "holm") ~ "FWR"
      ),
      method = case_when(
        method == "ols" ~ "OLS",
        method == "bh" ~ "BH",
        method == "by" ~ "BY",
        method == "bonf" ~ "Bonferroni",
        method == "holm" ~ "Holm",
        TRUE ~ method
      )
    ) 
}

# Bootstrap Tangency Portfolio --
# BS Func
# Bootstrap the tangency-portfolio weights of the supplied return columns.
#
# Each column is standardised by its within-sample volatility; the weights
# come from regressing a vector of ones on the factors with no intercept
# (proportional to the tangency portfolio), normalised to sum to one.
# shorting = FALSE instead fits the regression with non-negativity
# constraints via glmnet (lambda = 0, lower.limits = 0).
bootstrap_tpf <- function(data, n_boots = 100, shorting = T, seed = 1) {
  set.seed(seed)
  if (!shorting) {
    # Long-only: constrained least squares through glmnet
    boot_func <- function(splits, ...) {
      df <- analysis(splits) %>% apply(2, function(x) x / sd(x))
      glmnet::glmnet(y = rep(1, nrow(df)), x = df %>% as.matrix(), 
                     lambda = 0, lower.limits = 0, intercept = F) %>% 
        broom::tidy(return_zeros = T) %>% 
        filter(term != "(Intercept)") %>% 
        mutate(weight = estimate / sum(estimate)) %>%
        select(term, weight)
    }
  } else {
    # Unconstrained: plain OLS of 1 on the standardised factors
    boot_func <- function(splits, ...) {
      df <- analysis(splits) %>% apply(2, function(x) x / sd(x)) %>% as.data.frame()
      lm(rep(1, nrow(df)) ~ -1 + ., data = df) %>% 
        broom::tidy() %>% 
        mutate(weight = estimate / sum(estimate)) %>% 
        mutate(term = term %>% str_remove_all("`")) %>%
        select(term, weight)
    }
  }

  # apparent = T adds the original (non-resampled) data as an extra split
  data %>% 
    bootstraps(times = n_boots, apparent = T) %>%
    mutate(
      coef = splits %>% map(.f = boot_func)
    ) 
}

# Full tpf
# Tangency portfolio over theme-cluster portfolios plus the market.
#
# Relies on globals: regional_mkt_ret (data.table with a `region` column),
# cluster_labels, and char_info. orig_sig = TRUE keeps only factors that
# were significant in the original publication; FALSE keeps all factors.
# Returns the bootstrap_tpf() output tagged with market_region.
tpf_cluster <- function(data, mkt_region, orig_sig, min_date, n_boots, shorting, seed) {
  if (orig_sig) {
    orig_sig_values <- T
  } else {
    orig_sig_values <- c(T, F)
  }
  # data.table filter on the global market-return table
  market_ret <- regional_mkt_ret[region == mkt_region]
  
  # Equal-weight factors within each theme cluster. In the filter(), the
  # `orig_sig` name resolves to the joined column (data masking), while
  # orig_sig_values comes from the function environment.
  cluster_pf <- data %>%
    left_join(cluster_labels, by = "characteristic") %>%
    left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>%
    filter(orig_sig %in% orig_sig_values) %>%
    group_by(hcl_label, eom) %>%
    summarise(
      ret = mean(ret)
    )
  
  # Wide cluster-return panel with the market series appended
  tpf_data <- cluster_pf %>% 
    filter(eom >= min_date) %>% 
    spread(key = hcl_label, value = ret) %>%
    left_join(market_ret %>% select(eom, market), by = c("eom")) %>%
    rename(Market = market)
  
  tpf_data %>% select(-eom) %>% bootstrap_tpf(n_boots = n_boots, shorting = shorting, seed = seed) %>% mutate(market_region = mkt_region)
}

# In-Sample / Out-of-Sample Functions
# Split factor returns into in-sample ("is") and out-of-sample ("oos")
# periods relative to each factor's original publication sample.
#
# input:      long data with characteristic, eom, ret, mkt_vw_exc.
# min_obs:    minimum observations required in BOTH periods to keep a factor.
# orig_group: original-significance values to keep (subset of c(T, F)).
# ret_scaled: "none", "all" (scale within each period), or "is" (scale both
#             periods by the in-sample market-neutral volatility).
# type:       "pre", "post", or "pre_post" -- which years are out-of-sample.
# Relies on the global char_info (significance, sample_start, sample_end)
# and on data.table's uniqueN(). Returns the filtered data with ret_adj.
prepare_is_oos <- function(input, min_obs, orig_group, ret_scaled, type, print=F) {  # ret_scaled in ('none', 'all', 'is') & type in ('pre', 'post', 'pre_post')
  data <- input %>%
    select(characteristic, eom, ret, mkt_vw_exc) %>%
    left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>% 
    filter(orig_sig %in% orig_group) %>%
    left_join(char_info %>% select(characteristic, sample_start, sample_end), by = "characteristic") %>%
    mutate(
      period = case_when(
        year(eom) >= sample_start & year(eom) <= sample_end ~ "is",
        type == "pre" & year(eom) < sample_start ~ "oos",
        type == "post" & year(eom) > sample_end ~ "oos",
        type == "pre_post" & (year(eom) < sample_start | year(eom) > sample_end) ~ "oos"
      ),
      ret = ret * 100,            # returns in percent
      mkt_vw_exc = mkt_vw_exc * 100
    ) %>%
    filter(!is.na(period))
  
  # Require enough observations both in- and out-of-sample
  data_excl <- data %>%
    group_by(characteristic) %>%
    mutate(n_is = sum(period == "is"), n_oos = sum(period == "oos")) %>% 
    filter(n_is >= min_obs & n_oos >= min_obs)
  
  if (ret_scaled == "none") {
    # Fix: assign back to data_excl -- the original wrote to an unused
    # `data_adj`, so the returned data had no ret_adj column for this case.
    data_excl <- data_excl %>% ungroup() %>% mutate(ret_adj = ret)
  }
  
  if (ret_scaled == "all") {
    data_excl <- data_excl %>% 
      group_by(characteristic, period) %>%
      mutate(
        ret_neu = ret - cov(ret, mkt_vw_exc)/var(mkt_vw_exc) * mkt_vw_exc,
        ret_adj = ret * (10/sqrt(12))/sd(ret_neu)) %>%
      select(-ret_neu) %>%
      ungroup()
  }
  if (ret_scaled == "is") {
    # In-sample market-neutral volatility per factor
    is_vol <- data_excl %>%
      filter(period == "is") %>%
      group_by(characteristic) %>%
      mutate(ret_neu = ret - cov(ret, mkt_vw_exc)/var(mkt_vw_exc) * mkt_vw_exc) %>%
      summarise(
        is_sd = sd(ret_neu)
      )
    
    data_excl <- data_excl %>% 
      # Fix: join by characteristic only -- the data has no `region` column
      # here (it was selected away above), so the original join errored.
      left_join(is_vol, by = "characteristic") %>%
      group_by(characteristic, period) %>%
      mutate(ret_adj = ret * (10/sqrt(12))/is_sd) %>%
      ungroup() %>%
      select(-is_sd)
  }
  
  full <- data %>% summarise(fct_all = uniqueN(characteristic))
  excl <- data_excl %>% summarise(fct_excl = uniqueN(characteristic))
  
  if (print) {
    print(tibble("type"=type, full, excl))
  }
  
  return(data_excl)
}

# Economic Benefit of more Power
# Compare "trade on significance" rules for factors past their original
# sample period: empirical-Bayes (posterior 2.5% quantile above zero) vs
# OLS p <= 5% vs BY-adjusted p <= 5%. Prints a rejection-rate plot as a
# side effect and returns monthly returns of the selected factor sets.
#
# posterior_is: per-est_date posterior summaries with columns
#   characteristic, est_date, p025, ols_est, ols_se.
# Relies on globals: char_info and regional_pfs.
trading_on_significance <- function(posterior_is) {
  # Keep originally-significant factors after their sample ended; BY
  # adjustment is applied within each estimation date
  pf_base <- posterior_is %>%
    left_join(char_info %>% select(characteristic, significance, sample_end), by = "characteristic") %>%
    filter(significance == 1 & est_date >= sample_end) %>%
    group_by(est_date) %>%
    mutate(
      ols_p = pnorm(abs(ols_est / ols_se), lower.tail = F)*2,
      by_p = p.adjust(ols_p, method = "BY")
    ) 
  
  # Rejection rate per estimation date for each rule (trailing comma in
  # summarise() is tolerated by dplyr's dots handling)
  print(pf_base %>%
          summarise(
            rr_eb = mean(p025 > 0),
            rr_ols = mean(ols_p <= 0.05 & ols_est > 0),
            rr_by = mean(by_p <= 0.05 & ols_est > 0),
          ) %>%
          gather(rr_eb, rr_ols, rr_by, key = "type", value = "rr") %>%
          ggplot(aes(est_date, rr, colour = type)) +
          geom_point() +
          geom_line())
  
  # Positions are taken the year AFTER the estimation date
  positions <- pf_base %>%
    ungroup() %>%
    mutate(
      position_year = year(est_date) + 1,
      eb_sig = (p025 > 0),
      by_sig = (by_p <= 0.05 & ols_est > 0)
    ) %>%
    select(position_year, characteristic, eb_sig, by_sig)
  
  # marg_sig: factors the EB rule selects but the BY rule rejects
  candidate_factors <- regional_pfs %>%
    mutate(year = year(eom)) %>%
    left_join(positions, by = c("characteristic", "year" = "position_year")) %>%
    mutate(marg_sig = (eb_sig == T & by_sig == F)) %>%
    gather(marg_sig, eb_sig, by_sig, key = "type", value = "significant")
  
  # Equal-weighted monthly return of the selected factors per rule
  candidate_factors %>%
    filter(significant == T) %>%
    group_by(region, type, significant, eom) %>%
    summarise(
      n = n(),
      ret = mean(ret),
      mkt = mean(mkt_vw_exc)
    )  
}

# Simulation according to specification for Harvey et al (2016)
# Simulation following the specification of Harvey et al. (2016): clustered
# alphas where only the first cl_true clusters (and first n_true factors)
# are "true", estimated with the two-layer empirical-Bayes model over a
# grid of idiosyncratic taus.
#
# sim_settings: list with n, cl, fct_pr_cl, corr_within, corr_across, se,
#   alpha_0, ret (true cluster alpha), cl_true, n_true, tau_ws, n_sims,
#   fix_alpha.
# Returns, per tau_w, a list of per-simulation results: true/estimated
# alphas, posterior mean and covariance, and the MLE parameter table.
harvey_et_al_sim <- function(sim_settings, seed) {
  set.seed(seed)
  # Cluster membership
  m <- matrix(0, nrow = sim_settings$n, ncol = sim_settings$cl)  # Cluster membership
  j <- 0
  for (i in 1:sim_settings$cl) {
    m[(j+1):(j + sim_settings$fct_pr_cl), i] <- 1
    j <- j + sim_settings$fct_pr_cl
  }
  mm <- m %*% t(m)
  # Correlation Matrix: corr_within inside clusters, corr_across otherwise;
  # diagonal reset to 1 last (the == 1 assignment also hits the diagonal)
  corr_mat <- mm
  corr_mat[corr_mat == 0] <- sim_settings$corr_across
  corr_mat[corr_mat == 1] <- sim_settings$corr_within
  diag(corr_mat) <- 1
  
  # Average Correlation (Should be close to zero)
  # (the value is computed but discarded -- kept for interactive inspection)
  mean(corr_mat[lower.tri(corr_mat)])
  
  # Sigma: sampling covariance of the estimated alphas
  sigma <- sim_settings$se^2 * corr_mat
  
  # Predefine variables
  alpha_0_vec <- rep(sim_settings$alpha_0, sim_settings$n)
  i_n <- diag(sim_settings$n)
  
  # Simulation over the tau_w grid
  sim_settings$tau_ws %>% lapply(function(tau_w) {
    start <- proc.time()
    alpha_noise <- MASS::mvrnorm(n = sim_settings$n_sims, mu = rep(0, sim_settings$n), Sigma = sigma) # Preallocate alpha noise for efficiency
    tau_sim <- 1:sim_settings$n_sims %>% sapply(simplify = F, USE.NAMES = T, function(s) {
      print(paste("Tau_w:", tau_w, "- Simulation", s, "out of", sim_settings$n_sims))
      # Simulate Alphas: the FIRST cl_true clusters get the true cluster
      # alpha and the FIRST n_true factors get idiosyncratic alpha noise
      # (relies on factors being ordered so the true ones come first)
      alpha_c <- c(rep(sim_settings$ret, times = sim_settings$cl_true), rep(0, times = (sim_settings$cl - sim_settings$cl_true)))
      alpha_w <- c(rnorm(sim_settings$n_true) * tau_w, rep(0, sim_settings$n - sim_settings$n_true))
      alpha_true <- alpha_0_vec + m %*% alpha_c + alpha_w
      alpha_hat <- as.vector(alpha_true + alpha_noise[s, ])
      
      # Negative marginal log-likelihood for the two-layer model
      mle_func <- function(a, tc, tw) {
        a_vec <- rep(a, sim_settings$n)
        a_omega <- i_n * tw^2 + mm * tc^2
        a_cov <- sigma + a_omega  #  / t_mat
        
        -(mvtnorm::dmvnorm(x = alpha_hat, mean = a_vec, sigma = a_cov, log = T))  
      }
      
      # Crude method-of-moments starting values from cluster means/sds
      starting_values <- tibble(a = alpha_hat, cl = rep(1:sim_settings$cl, each = sim_settings$fct_pr_cl)) %>%
        group_by(cl) %>%
        summarise(
          cl_mean = mean(a),
          cl_sd = sd(a)
        ) %>%
        summarize(
          crude_a0 = mean(cl_mean),
          crude_tc = if_else(sim_settings$fix_alpha, sqrt(sum((cl_mean^2) / (n() - 1))), sd(cl_mean)),
          crude_tw = mean(cl_sd)
        )
      start_list <- list(
        a = starting_values$crude_a0, 
        tc = starting_values$crude_tc, 
        tw = starting_values$crude_tw)
      
      # Estimate Parameters (alpha fixed at zero when fix_alpha is set)
      if (sim_settings$fix_alpha) {
        (hyper_pars <- stats4::mle(minuslogl = mle_func, start = start_list, lower = c(-Inf, 0, 0), fixed = list(a = 0)))
      } else {
        (hyper_pars <- stats4::mle(minuslogl = mle_func, start = start_list, lower = c(-Inf, 0, 0)))
      }
      
      # Check convergence
      if (hyper_pars@details$convergence != 0) {
        warning("MLE step did not converge!!!")
        return(NULL)
      }
      mu <- hyper_pars@fullcoef["a"]
      tc <- hyper_pars@fullcoef["tc"]
      tw <- hyper_pars@fullcoef["tw"]
      mle <- tibble(
        s = rep(s, 3),
        coef = c("a", "tc", "tw"),
        mle = c(mu, tc, tw),
        crude = c(start_list$a, start_list$tc, start_list$tw)
      )
      print(mle)
      
      # Specify Posterior: normal-normal precision-weighted update
      omega <- i_n * tw^2 + mm * tc^2
      post_cov <- solve(solve(omega) + solve(sigma))
      post_alpha <- post_cov %*% (solve(omega) %*% alpha_0_vec + solve(sigma) %*% alpha_hat)
      
      list("alpha_true" = alpha_true, "alpha_hat" = alpha_hat, "post_alpha" = post_alpha, "post_cov" = post_cov, "mle" = mle)
    })
    print(proc.time() - start)  # 5 iterations took 232.13/60 = 4 minutes  
    return(tau_sim)
  }) 
}


# Single Factor Tangency Portfolio (TPF) ---------------------------------
sr_func <- function(data, w) {
  # Sharpe ratio of portfolio `w` over the return matrix `data`
  # (columns = assets, rows = periods): w'mu / sqrt(w' Sigma w).
  mu <- colMeans(data)
  sigma <- cov(data)
  drop((w %*% mu) / sqrt(t(w) %*% sigma %*% w))
}

epo_tpf <- function(data, s) {
  # Enhanced Portfolio Optimization (EPO) tangency weights.
  #
  # Shrinks the correlation matrix toward the identity by `s` in [0, 1]
  # (s = 0: plain mean-variance; s = 1: correlations ignored), then returns
  # the tangency weights normalized to sum to one.
  #
  # Args:
  #   data: return matrix/data frame, one column per asset.
  #   s: correlation shrinkage intensity in [0, 1].
  #
  # Returns: numeric vector of portfolio weights summing to one.
  sds <- apply(data, 2, sd)
  cor_shrunk <- diag(length(sds)) * s + cor(data) * (1 - s)
  cov_shrunk <- diag(sds) %*% cor_shrunk %*% diag(sds)
  # Solve the linear system once instead of inverting twice
  raw_w <- solve(cov_shrunk, colMeans(data))
  drop(raw_w / sum(raw_w))
}

# Prepare data for tangency portfolio (TPF) analysis
prepare_tpf_factors <- function(region, orig_sig_values, start, scale) {
  # Build the balanced factor-return panel used for tangency-portfolio analysis.
  #
  # Args:
  #   region: key into the global `eb_est` list and the `region` column of the
  #     global `regional_mkt_ret` (e.g. "us").
  #   orig_sig_values: original-significance flags to keep (e.g. TRUE, or
  #     c(TRUE, FALSE) to keep all factors).
  #   start: first end-of-month date (eom) to include.
  #   scale: if TRUE, rescale each series to a 10% annualized volatility.
  #
  # Returns: list with `long` (characteristic/eom/ret) and `wide`
  #   (one column per characteristic, eom column dropped).
  #
  # NOTE(review): relies on globals `regional_mkt_ret`, `eb_est`, `char_info`.
  # `!!region` unquotes the function argument so it is not data-masked by the
  # `region` column inside filter().
  mkt <- regional_mkt_ret %>% filter(region == !!region) %>% select(-region)
  
  # n = number of factors observed at each month-end
  tpf_factors <- eb_est[[region]]$input$long %>%
    left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>%
    filter(orig_sig %in% orig_sig_values) %>%
    group_by(eom) %>%
    mutate(n = n()) %>%
    ungroup() %>%
    filter(eom >= start) %>%
    select(characteristic, eom, n, ret)
  
  # Ensure that all factors have data
  missing <- tpf_factors %>% filter(n != max(n)) %>% select(eom, n) %>% distinct()
  if (nrow(missing) > 0) {
    warning("UNBALANCED PANEL - SOME FACTORS ARE MISSING DATA!")
  }
  # Keep only month-ends where every factor is observed (balanced panel)
  tpf_factors <- tpf_factors %>% filter(n == max(n)) %>% select(-n)  
  
  # 1. Overall TPF
  # Append the market return as an extra "factor", restricted to the same dates
  tpf_factors <- tpf_factors %>%
    bind_rows(mkt %>% rename("ret"="market") %>% mutate(characteristic = "market") %>% filter(eom %in% tpf_factors$eom)) 
  
  if (scale) {
    # 0.1 / sqrt(12): target 10% annualized vol expressed per month
    tpf_factors <- tpf_factors %>%
      group_by(characteristic) %>%
      mutate(ret = ret * (0.1 / sqrt(12)) / sd(ret))
  }
  
  tpf_factors_wide <- tpf_factors %>%
    pivot_wider(names_from = characteristic, values_from = ret) %>%
    select(-eom) 
  # Output
  list("long"=tpf_factors, "wide" = tpf_factors_wide)
}

# Optimal Shrinkage
optimal_shrinkage <- function(data, k, epo_range = seq(0, 1, 0.1)) {
  # Choose the EPO correlation-shrinkage parameter `s` by k-fold
  # cross-validation on out-of-sample Sharpe ratios.
  #
  # Args:
  #   data: long tibble with columns characteristic, eom, ret.
  #   k: number of cross-validation folds (contiguous in time).
  #   epo_range: candidate shrinkage values in [0, 1].
  #
  # Returns: the shrinkage value with the highest OOS Sharpe ratio.
  # Side effects: prints a ggplot of OOS Sharpe ratios by method and a
  # one-line summary comparing the best EPO to unconstrained MVO.
  
  # Split dates into k contiguous train/test folds without leakage
  finance_kfold <- function(dates, k, horizon) {
    # Expand each period-end into the `horizon` month-end dates it spans
    helper_eom_seq <- function(ends, horizon) {
      all <- ends %>% lapply(function(d) seq.Date(from = ceiling_date(d, unit = "month") - months(horizon - 1), to = ceiling_date(d, unit = "month"), by = "1 month") - 1)
      do.call(c, all) %>% unique()
    }
    # Split Dates into k groups
    date_vec <- dates %>% unique() %>% sort()
    split <- split(date_vec, cut(seq_along(date_vec), k, labels = FALSE))
    # Create train/test split
    1:k %>% lapply(function(i) {
      test_ends <- split[[i]]
      test_dates <- test_ends %>% helper_eom_seq(horizon = horizon)
      train_ends <- do.call(c, split[-i])
      train_dates <- train_ends %>% helper_eom_seq(horizon = horizon)
      # Avoid Data Leakage
      train_dates <- train_dates[!(train_dates %in% test_dates)]
      tibble(fold = i, train = list(train_dates), test = list(test_dates))
    }) %>%
      bind_rows()
  }
  
  date_split <- unique(data$eom) %>% finance_kfold(k = k, horizon = 1)
  cross_val <- 1:k %>% lapply(function(i)  {
    test_dates <- date_split$test[[i]]
    test <- data %>%
      filter(eom %in% test_dates) %>%
      pivot_wider(names_from = characteristic, values_from = ret) 
    test_eom <- test$eom
    test <- test %>% select(-eom)
    
    train_dates <- date_split$train[[i]]
    train <- data %>%
      filter(eom %in% train_dates) %>%
      pivot_wider(names_from = characteristic, values_from = ret) %>%
      select(-eom) 
    
    # Benchmark: non-negative least-squares weights, normalized to sum to one
    nonneg_w <- glmnet::glmnet(y = rep(1, nrow(train)), x = train %>% as.matrix(), 
                               lambda = 0, lower.limits = 0, intercept = FALSE) %>% 
      tidy(return_zeros = TRUE) %>% 
      filter(term != "(Intercept)") %>% 
      mutate(
        type = "Non-Negative",
        weight = estimate / sum(estimate)
      ) %>%
      select(type, weight)
    
    # EPO tangency weights for each candidate shrinkage value
    epo_w <- epo_range %>% lapply(function(s) {
      epo <- train %>% epo_tpf(s = s)
      tibble(type = paste0("EPO s=", s), weight = epo)
    })
    
    # OOS Performance
    c(list(nonneg_w), epo_w) %>% lapply(function(x) {
      w <- x$weight
      tibble(
        type = unique(x$type),
        eom = test_eom, 
        ret = drop(as.matrix(test) %*% w)
      )
    }) %>%
      bind_rows() %>%
      mutate(
        i = i,
        test_range = paste0(year(min(test_dates)), "-", year(max(test_dates)))
      )
  }) %>% bind_rows()
  
  # Monthly OOS Sharpe ratio per weighting scheme
  cross_val_summary <- cross_val %>%
    group_by(type) %>%
    summarise(
      ann_ret = mean(ret),
      sd = sd(ret),
      sr = ann_ret/sd
    ) %>%
    mutate(
      type_overall = if_else(str_detect(type, "EPO"), "EPO", "Non-Negative"),
      type = if_else(type == "EPO s=0", "Unconstrained", type),
      type = type %>% factor(levels = c("Non-Negative", "Unconstrained", paste0("EPO s=", seq(0.1, 1, 0.1))))
    ) 
  
  print(cross_val_summary %>%
          ggplot(aes(type, sr, group=type_overall)) +
          geom_point() +
          geom_path() +
          labs(colour = "Test Period:", y = "Monthly OOS SR of TPF") +
          theme(legend.position = "top", axis.title.x = element_blank(), axis.text.x = element_text(angle = 45, vjust=0.5)))
  
  opt_s_summary <- cross_val_summary %>% filter(type_overall == "EPO" & sr == max(sr)) %>% mutate(s = type %>% str_remove("EPO s=") %>% as.numeric()) 
  print(paste0("Highest OOS SR: ", opt_s_summary %>% pull(sr) %>% round(2), ", Standard MVO: ", cross_val_summary %>% filter(type == "Unconstrained") %>% pull(sr) %>% round(2)))
  
  # Optimal shrinkage
  opt_s_summary %>% pull(s)
}


# Table Functions --------------------------------------------------------
table_is_oos_ols <- function(is_oos_regs, is_post_regs) {
  # Regress out-of-sample alphas on in-sample alphas (US factors only) and
  # render both specifications as a stargazer LaTeX table.
  us_oos <- subset(is_oos_regs, region == "us")
  us_post <- subset(is_post_regs, region == "us")
  fit_oos <- lm(oos ~ is, data = us_oos)
  fit_post <- lm(post ~ is, data = us_post)
  
  stargazer::stargazer(
    fit_post, fit_oos,
    title = "OLS - Biased: $\\hat{\\alpha}_\\text{Out-of-Sample} = \\gamma_0 + \\gamma_1\\times\\hat{\\alpha}_\\text{In-Sample}$", out.header=T,
    no.space=T, digits=3, type='latex', single.row=F,
    align = T, notes.align = "l",
    omit.stat = c("adj.rsq", "f", "ser"),
    covariate.labels = c("Intercept", "$\\hat{\\alpha}_\\text{IS}$"), dep.var.caption = "Dependent Variable:", dep.var.labels = c("$\\hat{\\alpha}_\\text{Post IS}$", "$\\hat{\\alpha}_\\text{Pre \\& Post IS}$"),
    notes.append=T, report = "vc*t", intercept.bottom = F)
}

table_is_oos_nls <- function(nls_post, nls_oos) {
  # Render the NLS alpha-decay regressions as a stargazer LaTeX table.
  #
  # Args:
  #   nls_post: list with `nls_fit` (nls object) and `nls_data` for the
  #     post-in-sample window.
  #   nls_oos: same structure, for the pre- & post-in-sample window.
  #
  # Trick: stargazer does not accept nls objects, so for each model a
  # throw-away lm with matching coefficient names (k0, kh) is fitted on fake
  # data, and its coefficients/SEs/t-stats/p-values are overridden with the
  # NLS values via stargazer's coef/se/t/p arguments.
  info <- list(
    "post" = list(
      "dep" = "post",
      "fit" = nls_post
    ),
    "oos" = list(
      "dep" = "oos",
      "fit" = nls_oos
    )
  ) 
  
  op <- info %>% sapply(simplify = F, USE.NAMES = T, function(x) {
    # Create Fake Linear Model
    fake_data <- tibble(y = rnorm(10), k0 = rnorm(10), kh = rnorm(10))
    colnames(fake_data) <- c(x$dep, "k0", "kh")
    lm_string <- paste0(x$dep, "~k0 + kh -1")
    fake_lm = lm(lm_string, data = fake_data)
    fake_x = c("k0", "kh")
    
    # Generate various parts for output
    sum_xx = summary(x$fit$nls_fit)
    mat_xx = sum_xx$coefficients[1:2, ]
    colnames(mat_xx) = c("coef","se", "t", "p")
    indVarNames = rownames(mat_xx)
    
    # Generate coefficients, se, t-stat and p values 
    # (named after the fake lm terms so stargazer can match them)
    df_xx = as.data.frame(mat_xx)
    vCoef = df_xx$coef; names(vCoef)=fake_x
    vSE = df_xx$se; names(vSE)=fake_x
    vT = df_xx$t; names(vT)=fake_x
    vP = df_xx$p; names(vP)=fake_x
    
    formulaTxt = sum_xx$formula
    nParameters = sum_xx$df[1]
    nDF = sum_xx$df[2]
    obs <- length(x$fit$nls_fit$m$resid())
    n_fcts <- uniqueN(x$fit$nls_data$c)  # uniqueN from data.table
    rss = round(sum_xx$sigma, 3)
    convTolerance = x$fit$nls_fit$m$conv()
    list("lm"=fake_lm, "coef" = vCoef, "se" = vSE, "t" = vT, "p" = vP, "rss" = rss, "obs" = obs, "n_fcts" = n_fcts)
  })
  
  # Determine order
  y1 <- "post"
  y2 <- "oos"
  
  # Aesthetics 
  vTitle = "NLS - Unbiased: $R_{i,t} = \\alpha_i + (\\kappa_0 + \\kappa_h \\times \\alpha_i)\\times 1_\\text{\\{Out-of-Sample\\}}$"
  vType = "latex"
  # v_col_label = c("USA", "Developed", "Emerging")
  # Hand-built table rows (observations / factor counts), right-aligned
  lines_obs <- c("Observations", 
                 sprintf("\\multicolumn{1}{r}{%s}", prettyNum(op[[y1]]$obs, big.mark = ",")), 
                 sprintf("\\multicolumn{1}{r}{%s}", prettyNum(op[[y2]]$obs, big.mark = ",")))
  lines_fcts <- c("Factors", sprintf("\\multicolumn{1}{r}{%s}", op[[y1]]$n_fcts), sprintf("\\multicolumn{1}{r}{%s}", op[[y2]]$n_fcts))
  dep_var_options <- list("post" = "Post IS", "oos" = "Pre \\& Post IS")
  dep_var_lbls <- c(dep_var_options[[y1]], dep_var_options[[y2]])
  
  # Output
  stargazer::stargazer(
    op[[y1]]$lm, op[[y2]]$lm,
    title = vTitle, out.header=T,
    no.space=T, digits=3, type=vType, single.row=F,
    align = T, notes.align = "l",
    omit.stat = c("rsq","adj.rsq", "f", "n", "ser"),
    covariate.labels = c("$\\kappa_0$", "$\\kappa_h$"), dep.var.labels.include = T, dep.var.caption = "OOS Period:", dep.var.labels = dep_var_lbls,
    add.lines=list(lines_fcts, lines_obs), report = "vc*t", intercept.bottom = F,
    # notes=vNotes, notes.append=T,
    coef=list(op[[y1]]$coef, op[[y2]]$coef), se=list(op[[y1]]$se, op[[y2]]$se), t=list(op[[y1]]$t, op[[y2]]$t), p=list(op[[y1]]$p, op[[y2]]$p)
  )
}

table_taus <- function(){
  # LaTeX table of the maximum-likelihood tau estimates for each sample.
  # Reads the global `eb_est` list; each entry's $mle holds estimate/ml_est
  # pairs for tau_c, tau_s and tau_w.
  taus <- list(
    list("USA", "us"),
    list("Developed", "developed"),
    list("Emerging", "emerging"),
    list("USA, Developed & Emerging", "all"),
    list("World", "world"),
    list("World ex. US", "world_ex_us"),
    list("USA - Mega", "us_mega"),
    list("USA - Large", "us_large"),
    list("USA - Small", "us_small"),
    list("USA - Micro", "us_micro"),
    list("USA - Nano", "us_nano")
  ) %>% lapply(function(x) {
    eb_est[[x[[2]]]]$mle %>% 
      select(estimate, ml_est) %>%
      spread(key = estimate, value = ml_est) %>%
      mutate(sample = x[[1]])
  }) %>% 
    bind_rows() %>%
    select(sample, tau_c, tau_s, tau_w)
  
  tau_cap <- paste(
    "The table shows the tau parameters estimated by maximum likelihood.",
    "$\\tau_c$ is the estimated dispersion in cluster alphas.",
    "$\\tau_w$ is the estimated dispersion in factor alphas within a cluster.",
    "$\\tau_s$ is the estimated dispersion in alpha of the same factor in different regions."
  )
  
  # NOTE: tau_s and tau_w are deliberately swapped relative to the column
  # headers to match the notation of eq. (23) in the paper.
  taus %>%
    select("Sample" = sample, "$\\tau_c$" = tau_c, "$\\tau_w$" = tau_s, "$\\tau_s$" = tau_w) %>%
    xtable(auto=T, digits = 2, caption = tau_cap) %>% 
    print(include.rownames = F, caption.placement = "top", sanitize.colnames.function = identity) 
}

# Table - Factor Performance
table_factor_info <- function() {
  # Per-factor table: OLS alpha, EB posterior alpha and P(alpha <= 0) for the
  # US, developed and emerging regions, sorted by the US OLS estimate.
  # Factors that were never significant originally are suffixed with "*".
  # Reads the globals `eb_est` and `char_info`.
  wide <- eb_est$all$factors %>%
    mutate(p_zero = pnorm(q = 0, mean = post_mean, sd = post_sd)) %>%
    select(characteristic, region, ols_est, "eb_est" = post_mean, p_zero) %>% 
    pivot_wider(names_from = region, values_from = c(ols_est, eb_est, p_zero), names_sep = "_") %>%
    select(characteristic, ends_with("_us"), ends_with("_developed"), ends_with("_emerging"))
  
  tbl <- wide %>%
    left_join(char_info %>% select(characteristic, significance), by = "characteristic") %>%
    mutate(char_name = if_else(significance == 0, paste0(characteristic, "*"), characteristic)) %>%
    select(-characteristic, -significance) %>%
    select(char_name, everything()) %>%
    arrange(ols_est_us) %>%
    as.data.frame()
  
  # Insert blank spacer columns between the three regional column groups
  blank <- rep("", nrow(tbl))
  data.frame(tbl[, 1:4], "empty1" = blank, tbl[, 5:7], "empty2" = blank, tbl[, 8:10]) %>%
    xtable() %>%
    print()
}

table_economic_benefit <- function(sig_pfs) {
  # CAPM regression of the marginally-significant portfolio on the market,
  # per region, with Newey-West (6 lags) t-stats rendered via stargazer.
  sig_regs <- lapply(c("us", "developed", "emerging"), function(reg) {
    reg_data <- sig_pfs %>%
      filter(type == "marg_sig" & region == reg) %>%
      mutate(ret = ret*100, mkt = mkt*100) # Ensures alpha is in Percentages
    capm_fit <- lm(ret ~ mkt, data = reg_data)
    capm_nw <- lmtest::coeftest(capm_fit, vcov = sandwich::NeweyWest(capm_fit, lag = 6))
    list("fit"=capm_fit, "nw"=capm_nw)
  })
  
  # Hand-built, right-aligned table rows for obs counts and adjusted R^2
  obs_cells <- vapply(sig_regs, function(r) {
    sprintf("\\multicolumn{1}{r}{%s}", prettyNum(length(r$fit$residuals), big.mark = ","))
  }, character(1))
  r2_cells <- vapply(sig_regs, function(r) {
    sprintf("\\multicolumn{1}{r}{%s}", formatC(summary(r$fit)$adj.r.squared, digits = 2, format = "f"))
  }, character(1))
  lines_obs <- c("Observations", obs_cells)
  lines_r2 <- c("Adjusted $R^2$", r2_cells)
  
  stargazer::stargazer(sig_regs[[1]]$nw, sig_regs[[2]]$nw, sig_regs[[3]]$nw, dep.var.labels.include = F, dep.var.caption = "Region",
                       no.space = F, intercept.bottom = F, report = "vc*t", column.labels = c("US", "Developed ex. US", "Emerging"),
                       add.lines=list(lines_obs, lines_r2), covariate.labels = c("Alpha", "Market Beta"), align=T, digits=2)
}



# PLOT FUNCTIONS ---------------------------------------------------------
cluster_val <- function(cor, labels, op_format = "pdf") {
  # Average pairwise correlation between factor clusters, returned as a
  # zero-argument function that draws the corrplot (so the caller controls
  # the graphics device).
  #
  # Args:
  #   cor: factor-by-factor correlation matrix, characteristic names on dims.
  #   labels: data frame mapping `characteristic` to its cluster `hcl_label`.
  #   op_format: "pdf" (default) or "tex"; controls corrplot text sizes.
  #
  # Returns: a function() that renders the cluster correlation plot.
  
  # Validate the format up front: previously an unknown `op_format` left
  # `corrplot_cex` undefined and only failed later inside the closure.
  corrplot_cex <- switch(
    op_format,
    "tex" = list(tl = 0.8, number = 0.5),
    "pdf" = list(tl = 0.7, number = 0.45),
    stop("Unsupported op_format: ", op_format, call. = FALSE)
  )
  
  # Average the factor-level correlations within each ordered cluster pair
  pairwise_cor <- cor %>%
    as_tibble(rownames = "char1") %>%
    gather(-char1, key = "char2", value = "cor") %>%
    left_join(select(labels, characteristic, "label1" = hcl_label), by = c("char1"="characteristic")) %>%
    left_join(select(labels, characteristic, "label2" = hcl_label), by = c("char2"="characteristic")) %>%
    filter(char1 != char2) %>% 
    mutate(hcl_pair = str_c(label1, "_", label2)) %>%
    group_by(hcl_pair) %>%
    summarise(
      n = n(),
      cor_avg = mean(cor)
    ) %>%
    ungroup() %>%
    separate(hcl_pair, c("hcl1", "hcl2"), sep = "_") %>%
    select(hcl1, hcl2, cor_avg) %>%
    spread(key = hcl2, cor_avg) 
  
  # Convert to a plain matrix with cluster names on the rows
  pairwise_cor_names <- pairwise_cor$hcl1
  pairwise_cor <- pairwise_cor %>% select(-hcl1) %>% as.matrix()
  rownames(pairwise_cor) <- pairwise_cor_names
  
  # Needs to be saved as a functional (re-rendered per output device)
  function() {
    par(xpd=TRUE)
    pairwise_cor %>%
      corrplot::corrplot(method = "color", addCoef.col = "black", type = "lower", mar = c(0, 0, 3, 0), tl.cex = corrplot_cex$tl,
                         number.cex = corrplot_cex$number, tl.col = "black", col = colorRampPalette(c(colours_theme[2], "white", colours_theme[1]))(200))
  }
}
plot_mt_eb_comp <- function(mt, eb_all, eb_us = NULL, eb_developed = NULL, eb_emerging = NULL, eb_world = NULL, mts = c("OLS", "Bonferroni", "BY"), regs = c("us", "developed", "emerging", "world"), se_methods, se_regions) {
  # Compare replication rates across frequentist multiple-testing (MT)
  # adjustments and empirical-Bayes (EB) posteriors, by region.
  #
  # Args:
  #   mt: tibble of per-factor tests (char_reg, region, method, estimate,
  #     statistic, p, mt_adj).
  #   eb_all, eb_us, eb_developed, eb_emerging, eb_world: EB objects whose
  #     `$factors` holds posterior summaries (post_mean, post_sd, p025, p975).
  #   mts, regs: methods/regions kept from `mt`.
  #   se_methods, se_regions: methods/regions shown in the error-bar plot.
  #
  # Returns: list with `mt` (per-factor estimate + CI plot) and `repl`
  #   (replication-rate bar chart).
  # NOTE(review): relies on the global `char_info`.
  mt_sub <- mt %>%
    mutate(method = method %>% factor(levels = c("OLS", "Bonferroni", "Holm", "BH", "BY", "EB - Region", "EB - Full"))) %>%
    filter(method %in% mts & region %in% regs)
  
  # Implied t-cutoff per method/region: midpoint between the smallest
  # significant and largest insignificant |t|
  (t_cutoff <- mt_sub %>% 
      group_by(method, region) %>%
      summarise(
        t_cut = (min(abs(statistic)[p < 0.05]) + max(abs(statistic)[p > 0.05])) / 2
      ))
  
  # Stack the EB results; region-specific fits share the "EB - Region" label
  eb_comb <- bind_rows(
    eb_all$factors %>% mutate(method = "EB - All"),
    eb_us$factors %>% mutate(method = "EB - Region"),
    eb_developed$factors %>% mutate(method = "EB - Region"),
    eb_emerging$factors %>% mutate(method = "EB - Region"),
    eb_world$factors %>% mutate(method = "EB - Region")
  ) %>% mutate(
    method = method %>% factor(levels = c("OLS", "Bonferroni", "Holm", "BH", "BY", "EB - Region", "EB - All"))
  )
  
  # Frequentist replication rates among originally-significant factors
  mt_table <- mt_sub %>%
    mutate(characteristic = char_reg %>% str_remove("__.+")) %>%
    left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>% 
    filter(orig_sig == T) %>%
    group_by(region, method, mt_adj) %>%
    summarise(
      n = n(),
      significant = mean(p < 0.05 & estimate > 0), # Estimates also needs to be positive
      max_t_insig = max(abs(statistic[p > 0.05]))
    ) 
  
  # Bayesian replication rates: posterior 2.5th percentile above zero
  eb_table <- eb_comb %>%
    left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>% 
    filter(orig_sig == T) %>%
    mutate(ols_t = ols_est / ols_se) %>%
    group_by(region, method) %>%
    summarise(
      mt_adj = "Bayesian",
      n = n(),
      significant = mean(p025 > 0),
      max_t_insig = max(abs(ols_t[p025 < 0 & p975 > 0]))
    )
  
  (comp_table <- mt_table %>% 
      bind_rows(eb_table)) 
  
  # To install drlib put: devtools::install_github("dgrtwo/drlib")
  # Legacy version of the replication-rate plot; kept for reference only
  if (FALSE) {
    repl_plot <- comp_table %>%
      mutate(
        region_pretty = case_when(
          region == "us" ~ "US",
          region == "developed" ~ "Developed Ex. US",
          region == "emerging" ~ "Emerging",
          region == "world" ~ "World"
        ),
        region_pretty = region_pretty %>% factor(levels = c("US", "Developed Ex. US", "Emerging", "World"))
      ) %>%
      filter((region == "world" & method == "EB - All") | region != "world") %>%
      filter(method != "Bonferroni") %>%
      # filter(!(region == "world" & method != "Empirical Bayes")) %>%
      ggplot(aes(drlib::reorder_within(method, significant, region_pretty), significant, fill = method)) +
      geom_col() +
      drlib::scale_x_reordered() +
      geom_text(aes(label = str_c(formatC(round(significant * 100, 2), digits = 2, format = "f"), "%")), nudge_y = 0.025, size = 3) +
      facet_wrap(~region_pretty, nrow = 1, scales = "free_x") +
      labs(x = "Method", fill = "Multiple Testing Adj.", y = "Replication Rate (%)") +
      theme(legend.position = "none")
  }
  
  # Replication-rate bar chart: one bar per method, faceted by region
  repl_plot <- comp_table %>%
    # filter((region == "world" & method == "EB - All") | region != "world") %>%
    filter(method != "Bonferroni") %>%
    group_by(method) %>%
    # mutate(sort_var = significant[region == "us"]) %>%
    mutate(
      region_pretty = case_when(
        region == "us" ~ "US",
        region == "developed" ~ "Developed Ex. US",
        region == "emerging" ~ "Emerging",
        region == "world" ~ "World"
      ),
      region_pretty = region_pretty %>% factor(levels = c("US", "Developed Ex. US", "Emerging", "World")),
      method_pretty = case_when(
        method == "BY" ~ "Benjamini-Yekutieli",
        method == "EB - Region" ~ "Empirical Bayes - Region",
        method == "EB - All" ~ "Empirical Bayes - All",
        method == "OLS" ~ "OLS"
      ),
      method_pretty = method_pretty %>% factor(levels = c("OLS", "Benjamini-Yekutieli", "Empirical Bayes - Region", "Empirical Bayes - All"))
    ) %>%
    ggplot(aes(method_pretty, significant*100, fill = method_pretty)) + #reorder(method_pretty, sort_var)
    geom_col() +
    geom_text(aes(label = str_c(formatC(round(significant * 100, 1), digits = 1, format = "f"), "%")), nudge_y = 2.5, size = 3) +
    facet_grid(. ~ region_pretty, scales = "free", space='free') +
    # facet_wrap(~region_pretty, nrow = 1, scales = "free_x") +
    labs(x = "Method", fill = "Multiple Testing Adj.", y = "Replication Rate (%)") +
    theme(legend.position = "none", axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1, size = 10), axis.title.x = element_blank())
  
  
  # Reshape EB results to the same columns as mt_sub for the error-bar plot
  eb_overview <- eb_comb %>% 
    left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>% 
    mutate(
      t_cut = 1.96,
      type = case_when(
        p025 > 0 & orig_sig == 1 ~ "Replicated",
        p025 <= 0 & orig_sig == 1 ~ "Not Replicated",
        orig_sig == 0 ~ "Never Significant"
      )
    ) %>% 
    select(region, method, type, char_reg, estimate = post_mean, t_cut, se = post_sd)
  
  
  
  # Per-factor estimate + CI plot, factors ordered by the US OLS estimate
  mt_plot <- mt_sub %>%
    mutate(characteristic = char_reg %>% str_remove("__.+")) %>%
    left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>% 
    left_join(t_cutoff, by = c("region", "method")) %>%
    mutate(
      # significant = if_else(p < 0.05 & estimate > 0, "Significant", "Insignificant"),
      type = case_when(
        p <= 0.05 & estimate > 0 & orig_sig == 1 ~ "Replicated",
        (p > 0.05 | estimate <= 0) & orig_sig == 1 ~ "Not Replicated",
        orig_sig == 0 ~ "Never Significant"
      )
    ) %>%
    bind_rows(eb_overview) %>%
    filter(region %in% se_regions & method %in% se_methods) %>%
    mutate(
      characteristic = char_reg %>% str_extract(".+[?=__]") %>% str_remove("__"),
      type = type %>% factor(levels = c("Replicated", "Not Replicated", "Never Significant")),
      method_pretty = case_when(
        method == "BY" ~ "Multiple Testing - Benjamini-Yekutieli",
        method == "EB - Region" ~ "Empirical Bayes - US",
        method == "EB - All" ~ "Empirical Bayes - Global",
        method == "OLS" ~ "OLS"
      ),
      method_pretty = method_pretty %>% factor(levels = c("OLS", "Multiple Testing - Benjamini-Yekutieli", "Empirical Bayes - US", "Empirical Bayes - Global"))
    ) %>%
    group_by(characteristic) %>%
    # mutate(sort_var = statistic[method == "OLS" & region == "us"]) %>%
    mutate(sort_var = estimate[method == "OLS" & region == "us"]) %>%
    group_by(region, method) %>%
    mutate(
      ols_rank = frank(sort_var),
      repl_rate = sum(type == "Replicated") / sum(type %in% c("Replicated", "Not Replicated"))
    ) %>%
    ggplot(aes(reorder(ols_rank, sort_var), estimate, colour = type, linetype = type)) +
    geom_point() +
    geom_text(aes(x = 35, y = 1.45, label = str_c("Replication Rate: ", formatC(round(repl_rate*100, 1), digits = 1, format = "f"), "%")), 
              colour = "black", size = 3, check_overlap = T) +
    geom_errorbar(aes(ymin = estimate - t_cut * se, ymax = estimate + t_cut * se)) +
    facet_wrap(~method_pretty, ncol = length(se_methods) / 2) + 
    coord_cartesian(ylim = c(-1, 1.5)) +
    geom_hline(yintercept = 0, linetype = "dashed") +
    guides(colour = guide_legend(override.aes = list(shape = NA))) + 
    labs(y = "Monthly Alpha (%)") +
    theme(
      axis.title.x = element_blank(),
      axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
      # text = element_text(size = 13),
      legend.title = element_blank(),
      legend.position = "top"
    )
  
  list("mt" = mt_plot, "repl" = repl_plot)
}

plot_fdr <- function(simulated_fdr) {
  # Plot simulated FDR/FWR against the critical t-value, with reference
  # lines at a 5% rate and at t = 1.96.
  rates_long <- simulated_fdr %>%
    gather(fdr, fwr, key = "type", value = "rate") %>%
    mutate(type = str_to_upper(type))
  
  ggplot(rates_long, aes(t_cutoff, rate, colour = type)) +
    geom_point() +
    geom_hline(yintercept = 0.05, linetype = "dashed") +
    geom_vline(xintercept = 1.96, linetype = "dotted") +
    scale_y_continuous(breaks = c(0, 0.05, 0.25, 0.5, 0.75, 1)) +
    scale_x_continuous(breaks = c(0, 1.96, 2.5, 5.0, 7.5, 10)) +
    labs(x = "Critical Value (t)", y = "Rate", colour = "Type:") +
    theme(legend.position = "top")
}

plot_factor_post <- function(eb, orig_sig, cluster_order) {
  # Posterior alpha per factor with a 95% CI, coloured/shaped by cluster.
  #
  # Args:
  #   eb: EB estimation object; `eb$factors` must contain characteristic,
  #     post_mean, post_sd and hcl_label.
  #   orig_sig: if TRUE, keep only factors significant in the original study;
  #     otherwise keep all factors.
  #   cluster_order: factor-level order for the cluster legend.
  # NOTE(review): relies on the global `char_info`.
  orig_sig_values <- if (orig_sig) TRUE else c(TRUE, FALSE)
  
  eb$factors %>%
    left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>%
    filter(orig_sig %in% orig_sig_values) %>%
    group_by(hcl_label) %>%
    mutate(
      # Sort by cluster median alpha, tie-broken by the factor's own alpha
      sort_var = median(post_mean) + post_mean / 1000000
    ) %>%
    mutate(hcl_label = hcl_label %>% factor(levels = cluster_order)) %>%
    ggplot(aes(reorder(characteristic, sort_var), post_mean, colour = hcl_label, shape = hcl_label)) +
    geom_point() +
    scale_shape_manual(values=1:13) +
    geom_errorbar(aes(ymin = post_mean - 1.96 * post_sd, ymax = post_mean + 1.96 * post_sd)) +
    geom_hline(yintercept = 0, linetype = "dashed") +
    labs(y = "Monthly Alpha with 95% Confidence Interval (%)", colour = "Cluster", shape = "Cluster") +
    theme(
      axis.title.x = element_blank(),
      axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)
    )
}

plot_repl_region <- function(eb_all, cluster_order) {
  # Replication rate per cluster and region (share of originally-significant
  # factors whose posterior 95% CI lies above zero), as faceted bar charts.
  # Relies on the global `char_info`.
  repl_by_cluster <- eb_all$factors %>%
    left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>%
    filter(orig_sig == 1) %>%
    mutate(hcl_label = hcl_label %>% factor(levels = cluster_order)) %>%
    group_by(region, hcl_label) %>%
    summarise(n = n(), repl_rate = mean(post_mean - 1.96 * post_sd > 0)) %>%
    group_by(hcl_label) %>%
    mutate(
      # Order clusters by US replication rate, tie-broken by US factor count
      cl_order = repl_rate[region == "us"] + n[region == "us"] / 1e6,
      region_pretty = case_when(
        region == "us" ~ "USA",
        region == "developed" ~ "Developed Ex. USA",
        region == "emerging" ~ "Emerging"
      ),
      region_pretty = region_pretty %>% factor(levels = c("USA", "Developed Ex. USA", "Emerging"))
    )
  
  repl_by_cluster %>%
    ggplot(aes(reorder(hcl_label, cl_order), repl_rate * 100, fill = hcl_label)) +
    geom_col() +
    labs(y = "Replication Rate (%)") +
    facet_wrap(~region_pretty, ncol = 1) +
    theme(
      legend.position = "none",
      axis.title.x = element_blank(),
      axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)
    )
}

# Figure 1 - Waterfall Graph
plot_lit_comp <- function(eb_us, mt_res, eb_world, excl_insig=T) {
  # Waterfall chart comparing the replication rate across methodologies:
  # HXZ (2020) baseline, raw returns, CAPM alphas, multiple-testing
  # adjustment, and the EB estimates on US and global data.
  #
  # Args:
  #   eb_us: US empirical-Bayes object (uses $input$long and $factors).
  #   mt_res: frequentist multiple-testing results tibble.
  #   eb_world: world EB object (uses $factors).
  #   excl_insig: if TRUE, add a bar restricted to originally-significant
  #     factors and exclude never-significant ones from later bars.
  # NOTE(review): relies on globals `char_info` and `colours_theme`.
  if (excl_insig) {
    sig_group <- T
  } else {
    sig_group <- c(T, F)
  }
  # Raw-return t-tests: one intercept-only regression per factor
  raw_reg <- eb_us$input$long %>%
    group_by(characteristic) %>%
    nest() %>%
    mutate(
      raw_reg = data %>% map(~ lm(ret ~ 1, data = .x)),
      tidied = raw_reg %>% map(tidy)
    ) %>%
    unnest(tidied) %>% 
    ungroup() 
  
  raw_overall <- raw_reg %>%
    summarise(repl_rate = mean(p.value < 0.05 & estimate > 0)) %>%
    pull(repl_rate)
  
  # Same, restricted to factors significant in the original publication
  raw_sig <- raw_reg %>% 
    left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>% 
    filter(orig_sig == T) %>%
    summarise(repl_rate = mean(p.value < 0.05 & estimate > 0)) %>%
    pull(repl_rate)
  
  # CAPM alphas, with (BY) and without (OLS) multiple-testing adjustment
  capm <- mt_res %>% 
    filter(region == "us" & method %in% c("BY", "OLS")) %>% 
    mutate(characteristic = char_reg %>% str_remove("__.+")) %>%
    left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>% 
    filter(orig_sig %in% sig_group) %>%
    group_by(method) %>% 
    summarise(repl_rate = mean(p < 0.05 & estimate > 0))
  
  eb_us_repl <- eb_us$factors %>%
    left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>% 
    filter(orig_sig %in% sig_group) %>%
    summarise(repl_rate = mean(p025 > 0)) %>%
    pull(repl_rate)
  
  eb_global_repl <- eb_world$factors %>% 
    ungroup() %>% 
    left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>% 
    filter(orig_sig %in% sig_group) %>%
    summarise(repl_rate = mean(post_mean - 1.96 * post_sd > 0)) %>% 
    pull(repl_rate)
  
  # Waterfall Graph (bars ordered from HXZ baseline to EB global)
  litterature_comp <- tribble(
    ~ type, ~repl_rate,
    "hxz", 0.35,
    "raw", raw_overall,
    "raw_sig", raw_sig,
    "alpha", capm %>% filter(method == "OLS") %>% pull(repl_rate),
    "mt", capm %>% filter(method == "BY") %>% pull(repl_rate),
    "eb_us", eb_us_repl,
    "eb_global", eb_global_repl
  ) 
  if (excl_insig == F) {
    litterature_comp <- litterature_comp %>% filter(type != "raw_sig")
  } 
  # Each bar spans from the previous bar's rate to its own rate
  litterature_comp <- litterature_comp %>%
    mutate(
      repl_rate = repl_rate * 100,
      type = type %>% factor(levels = c("hxz", "raw", "raw_sig", "alpha", 
                                        "mt", "eb_us", "eb_global")),
      prev_repl_rate = dplyr::lag(repl_rate, default = 0),
      impact = if_else(repl_rate > prev_repl_rate, "Increase", "Decrease"),
      impact = impact %>% factor(levels = c("Increase", "Decrease"))
    ) %>%
    setDT() 
  
  w <- 0.3  #use to set width of bars
  l1 <- -3  # vertical position of the first annotation line (below axis)
  inc <- -3  # vertical spacing between annotation lines
  col_top <- "black" # colours_theme[2]
  col_bot <- "black" # colours_theme[1]
  type <- litterature_comp$type
  plot <- litterature_comp %>%
    ggplot(aes(xmin = as.integer(type) - w, xmax = as.integer(type) + w, ymin = prev_repl_rate, ymax = repl_rate, fill = impact)) +
    geom_rect() +
    geom_segment(data = litterature_comp[1:(.N - 1)], aes(x = as.integer(type) + w, xend = as.integer(type) + w + 1, y = repl_rate, yend = repl_rate)) +
    scale_x_discrete(limits = type) +
    geom_text(aes(x = as.integer(type), y = repl_rate + if_else(sign(repl_rate - prev_repl_rate)!=0, sign(repl_rate - prev_repl_rate), 1) * 2, label = str_c(formatC(round(repl_rate, 1), digits = 1, format = "f"), "%"))) + 
    scale_fill_manual(values = (c("Increase" = colours_theme[1], "Decrease" = colours_theme[2]))) +
    labs(x = "Implementation", y = "Replication Rate (%)") +
    coord_cartesian(ylim = c(0, 90), expand = FALSE, clip = "off") +
    # HXZ
    annotate(geom = "text", x = "hxz", y = l1, label = "Hou, Xue, and", colour = col_top, fontface = 2) +
    annotate(geom = "text", x = "hxz", y = l1 + inc*1, label = "Zhang (2020)", colour = col_top, fontface = 2) +
    annotate(geom = "text", x = "hxz", y = l1 + inc*2, label = "Raw returns", colour = col_bot, fontface = 1) +
    # Our Raw
    annotate(geom = "text", x = "raw", y = l1, label = "Our sample", colour = col_top, fontface = 2) +
    annotate(geom = "text", x = "raw", y = l1 + inc*1, label = "Raw returns,", colour = col_bot) +
    annotate(geom = "text", x = "raw", y = l1 + inc*2, label = "our methodology", colour = col_bot) +
    # Our Alpha
    annotate(geom = "text", x = "alpha", y = l1, label = "Our sample", colour = col_top, fontface = 2) +
    annotate(geom = "text", x = "alpha", y = l1 + inc*1, label = "CAPM alphas", colour = col_bot) +
    # Our MT
    annotate(geom = "text", x = "mt", y = l1, label = "Harvey, Liu, and", colour = col_top, fontface = 2) +
    annotate(geom = "text", x = "mt", y = l1 + inc*1, label = "Zhu (2016)", colour = col_bot, fontface = 2) +
    annotate(geom = "text", x = "mt", y = l1 + inc*2, label = "Multiple testing", colour = col_bot) +
    annotate(geom = "text", x = "mt", y = l1 + inc*3, label = "adjustment", colour = col_bot) +
    # Our EB-US
    annotate(geom = "text", x = "eb_us", y = l1, label = "Our Bayesian", colour = col_top, fontface = 2) +
    annotate(geom = "text", x = "eb_us", y = l1 + inc*1, label = "estimation", colour = col_top, fontface = 2) +
    annotate(geom = "text", x = "eb_us", y = l1 + inc*2, label = "US data", colour = col_bot) +
    # Our EB-Global
    annotate(geom = "text", x = "eb_global", y = l1, label = "Our Bayesian", colour = col_top, fontface = 2) +
    annotate(geom = "text", x = "eb_global", y = l1 + inc*1, label = "estimation", colour = col_top, fontface = 2) +
    annotate(geom = "text", x = "eb_global", y = l1 + inc*2, label = "Global data", colour = col_bot) +
    theme(
      legend.title = element_blank(),
      plot.margin = unit(c(1, 1, 4, 1), "lines"),
      axis.title.x = element_blank(),
      axis.text.x = element_blank(),
      panel.grid.major.x = element_blank(),
      panel.grid.minor.x = element_blank()) 
  
  if (excl_insig) {
    # Our Raw Significant only
    plot <- plot +
      annotate(geom = "text", x = "raw_sig", y = l1, label = "Our sample", colour = col_top, fontface = 2) +
      annotate(geom = "text", x = "raw_sig", y = l1 + inc*1, label = "Excl. factors", colour = col_bot) +
      annotate(geom = "text", x = "raw_sig", y = l1 + inc*2, label = "never significant", colour = col_bot)
  }
  return(plot)
}

# The power of many factors: how 95% confidence intervals shrink when the
# evidence from all 140+ factors is pooled.
#
# Builds per-factor CAPM alphas with Newey-West t-statistics across the three
# regions, applies classical multiple-testing adjustments (Bonferroni, Holm,
# BH, Benjamini-Yekutieli) to the OLS p-values, backs out each method's
# implied |t| cutoff at 140 tests, and compares the resulting centered
# confidence intervals with the (tighter) empirical-Bayes posterior interval.
# Returns a ggplot.
#
# Relies on globals defined elsewhere in the script: `eb_est` (EB estimation
# results), `colours_theme`, and `geom_dl`/`dl.trans` from the directlabels
# package for the line labels.
plot_many_factors <- function() {
  # The Power of Many Factors
  many_factors_se <- eb_est$us$input$long %>%
    select("char_reg" = name_wide, ret_neu_scaled, mkt_vw_exc) %>%
    mutate(
      # The region is encoded after a double underscore in the factor name
      region = char_reg %>% str_extract(pattern = "(?<=_{2}).+"),
      region = case_when(
        region == "us" ~ "USA",
        region == "developed" ~ "Developed Ex. USA",
        region == "emerging" ~ "Emerging"
      ),
      region = region %>% factor(levels = c("USA", "Developed Ex. USA", "Emerging"))
    ) %>%
    filter(!is.na(ret_neu_scaled)) %>%
    group_by(char_reg, region) %>%
    nest() %>%
    mutate(
      fit = data %>% map(~lm(ret_neu_scaled ~ mkt_vw_exc, data = .x)),  # They are market neutral by construction but this takes care of the degrees of freedom adjustment 
      nw = fit %>% map(~ lmtest::coeftest(.x, vcov = sandwich::NeweyWest(.x)) %>% broom::tidy()),  # Newey-West standard errors
      df = fit %>% map_dbl(~ .x$df.residual)
    ) %>%
    unnest(nw) %>%
    filter(term == "(Intercept)") %>%  # the intercept is the market-neutral alpha
    rename("p_ols" = p.value) %>%
    # group_by(region) %>%
    ungroup() %>%
    mutate(
      # Multiple-testing adjusted p-values, computed across all factor-regions
      n = n(),
      p_bonf = p_ols %>% p.adjust(method = "bonferroni"),
      p_holm = p_ols %>% p.adjust(method = "holm"),
      p_bh = p_ols %>% p.adjust(method = "BH"),
      p_by = p_ols %>% p.adjust(method = "BY")
    ) %>%
    select(n, region, char_reg, estimate, statistic, df, "se" = std.error, starts_with("p_")) %>%
    gather(starts_with("p_"), key = "method", value = "p") %>%
    mutate(
      method = method %>% str_remove("^p_"),
      # FDR = false discovery rate control, FWR = family-wise error control
      mt_adj = case_when(
        method == "ols" ~ "None",
        method == "bh" ~ "FDR",
        method == "by" ~ "FDR",
        method == "bonf" ~ "FWR",
        method == "holm" ~ "FWR"
      ),
      method = case_when(
        method == "ols" ~ "OLS",
        method == "bh" ~ "BH",
        method == "by" ~ "Benjamini-Yekutieli",
        method == "bonf" ~ "Bonferroni",
        method == "holm" ~ "Holm",
        TRUE ~ method
      ),
      method = method %>% factor(levels = c("OLS", "Bonferroni", "Holm", "BH", "Benjamini-Yekutieli", "EB - Region", "EB - Full"))
    ) 
  
  # Implied |t| cutoff at 140 tests per method: midpoint between the smallest
  # significant and the largest insignificant |t|.
  # NOTE(review): the wrapping parentheses only auto-print at top level, not
  # inside a function body.
  (mf_t <- many_factors_se %>% 
      group_by(method) %>%
      summarise(
        t_140 = (min(abs(statistic)[p < 0.05]) + max(abs(statistic)[p > 0.05])) / 2
      ))
  
  # Average OLS standard error used to translate t-cutoffs into CI widths
  avg_se <- mean(many_factors_se$se)
  
  # CI bounds at n = 1 (t = 1.96) and n = 140 (method-specific cutoff);
  # "Empirical Bayes" has no entry in mf_t, so its n = 140 row is dropped
  # and supplied separately below.
  ols_ci <- tibble(
    method = c("OLS", "Bonferroni", "Benjamini-Yekutieli", "Empirical Bayes"), 
    t_1 = rep(1.96, 4)
  ) %>%
    left_join(mf_t, by = "method") %>%
    gather(t_1, t_140, key = "n_factors", value = "t") %>%
    mutate(
      n_factors = n_factors %>% str_remove("t_") %>% as.integer(),
      p025 = -t * avg_se,
      p975 = t * avg_se
    ) %>%
    filter(!(method == "Empirical Bayes" & n_factors == 140)) 
  
  # Posterior-based CI for EB at 140 factors, from the average posterior SD
  eb_ci <- eb_est$us$factors %>%
    summarise(
      eb_se = mean(post_sd)
    ) %>%
    transmute(
      method = "Empirical Bayes",
      n_factors = 140,
      p025 = -1.96 * eb_se,
      p975 = 1.96 * eb_se
    )  
  
  comb_data <- bind_rows(ols_ci, eb_ci)
  
  comb_data %>%
    ggplot(aes(n_factors, colour = method, linetype = method)) +
    geom_line(aes(y = p025)) +
    geom_line(aes(y = p975)) +
    geom_ribbon(data=comb_data %>% filter(method == "Empirical Bayes"), 
                aes(x = n_factors, ymin=p025,ymax=p975), fill=colours_theme[3], alpha=0.2, inherit.aes = F) +
    labs(x = "Number of Factors", y = "Centered 95% Confidence Interval", colour = "Method", linetype = "Method") +
    scale_y_continuous(breaks=c(avg_se * 1.96, 0, -avg_se * 1.96),
                       labels=c(expression(hat(alpha) + sigma[hat(alpha)] %*% t), expression(hat(alpha)), expression(hat(alpha) - sigma[hat(alpha)] %*% t))) +
    scale_x_continuous(breaks = c(1, 140), expand=expansion(mult = c(0, 0.3), 
                                                            add = c(5, 0))) +
    geom_dl(aes(label = method, y = p975), method = list(dl.trans(x = x + 0.2), "last.points", cex = 1)) +
    theme(
      text = element_text(size = 12),
      axis.title.y = element_blank(),
      legend.position = "none"
    )
}


# Plot in-sample vs. post-sample alphas by region.
#
# Stacks the OLS regression output of the three region objects (`ub_us`,
# `ub_dev`, `ub_emer`, each carrying an `ols_regs` data frame), keeps only
# factors that were significant in the original publications
# (`char_info$significance == 1`), and returns a scatter of in-sample vs.
# post-sample monthly alphas with a 45-degree reference line and an OLS fit,
# faceted by region.
#
# Fix: removed a cluster-level ggplot (`cluster_plot`) that was assigned to a
# local variable but never used or returned (dead computation), along with an
# unused `nudge_y` column inside it.
#
# Relies on globals: `char_info`.
plot_is_oos <- function(ub_us, ub_dev, ub_emer) {
  is_oos_split <- ub_us$ols_regs %>% 
    mutate(region = "us") %>%
    bind_rows(
      ub_dev$ols_regs %>% mutate(region = "dev"),
      ub_emer$ols_regs %>% mutate(region = "emer")
    ) %>%
    mutate(
      # Pretty region names with a fixed display order
      region = case_when(
        region == "us" ~ "USA",
        region == "dev" ~ "Developed",
        region == "emer" ~ "Emerging"
      ),
      region = region %>% factor(levels = c("USA", "Developed", "Emerging"))
    ) %>%
    rename("characteristic" = c)
  
  is_oos_split %>%
    left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>% 
    filter(orig_sig == 1) %>%  # only originally-significant factors
    ggplot(aes(is, post_sample)) +
    geom_point() +
    geom_abline(intercept = 0, slope = 1, linetype = "dotted") +  # 45-degree line: no alpha decay
    geom_smooth(method = "lm", se = F, formula = "y ~ x") +
    facet_wrap(~region, ncol = 1) +
    labs(x = "Monthly Alpha (%): In-Sample", y = "Monthly Alpha (%): Post Sample")
}


# Scatter of in-sample vs. out-of-sample factor alphas by region.
#
# Keeps only factors that were significant in the original publications,
# prints per-region OLS regressions of OOS alpha on IS alpha, and returns a
# scatter plot with regression lines, faceted by region.
#
# Fix: the per-region regression block used to be a bare intermediate
# expression, whose result is silently discarded inside a function body
# (no auto-printing); it is now wrapped in print() so the coefficients are
# actually reported, matching the diagnostic output style used elsewhere in
# this file.
#
# Relies on globals: `char_info`.
plot_is_oos_factors <- function(is_oos_regions) {
  is_oos_data <- is_oos_regions %>%
    left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>% 
    filter(orig_sig == 1) %>%
    mutate(
      region = case_when(
        region == "us" ~ "USA",
        region == "developed" ~ "Developed Ex. USA",
        region == "emerging" ~ "Emerging"
      ),
      region = region %>% factor(levels = c("USA", "Developed Ex. USA", "Emerging"))
    ) %>%
    select(region, characteristic, period, estimate) %>%
    spread(key = period, value = estimate) 
  
  # Per-region OLS of out-of-sample on in-sample alpha (diagnostic output)
  print(
    is_oos_data %>%
      group_by(region) %>%
      nest() %>%
      mutate(
        fit = data %>% map(~lm(oos ~ is, data = .x)),
        n = fit %>% map_dbl(~length(.x$residuals)),
        tidied = fit %>% map(tidy)
      ) %>%
      unnest(tidied)
  )
  
  is_oos_data %>% 
    ggplot(aes(is, oos)) + 
    geom_point() + 
    geom_smooth(method = "lm", se = F, formula = "y ~ x") +
    facet_wrap(~region) +
    geom_hline(yintercept = 0) +
    geom_vline(xintercept = 0) +
    coord_fixed() +
    labs(x = "Monthly Alpha (%): In-Sample", y = "Monthly Alpha (%): Out-of-Sample")
}

# Tangency-portfolio weights with bootstrap error bars.
#
# `tpf` holds one "Apparent" row (the full-sample point estimate) plus
# bootstrap draws, with the weights nested in the `coef` list-column.
# Prints the number of clusters whose lower bootstrap quantile is above
# zero and returns a horizontal bar chart of the weights.
plot_tpf <- function(tpf, cluster_order, ci_low = 0.05, ci_high = 0.95) {
  # Point estimates from the full-sample ("Apparent") fit
  apparent_w <- tpf %>%
    filter(id == "Apparent") %>%
    select(coef) %>%
    unnest(coef) %>%
    rename("tpf_weight" = weight)
  
  # Bootstrap distribution of the weights, summarised per term
  boot_stats <- tpf %>%
    filter(id != "Apparent") %>%
    unnest(coef) %>%
    group_by(term) %>%
    summarise(
      bs_mean = mean(weight),
      bs_sd = sd(weight),
      bs_se = bs_sd / sqrt(n()),
      bs_low = quantile(weight, ci_low),
      bs_high = quantile(weight, ci_high),
      bs_prob_zero = mean(weight == 0)
    ) %>%
    left_join(apparent_w, by = "term") %>%
    mutate(bs_bias = bs_mean - tpf_weight)
  
  n_sig_pos <- sum(filter(boot_stats, term != "Market")$bs_low > 0)
  print(paste0("Clusters with significantly positive TPF weight: ", n_sig_pos))
  
  boot_stats %>%
    mutate(term = factor(term, levels = c(cluster_order, "Market"))) %>%
    ggplot(aes(reorder(term, tpf_weight), tpf_weight * 100, fill = term)) +
    geom_col() +
    coord_flip() +
    geom_errorbar(mapping = aes(ymin = bs_low * 100, ymax = bs_high * 100), width = 0.2, size = 0.2) +
    labs(y = "Weight in Tangency PF (%)") +
    theme(
      axis.title.y = element_blank(),
      legend.position = "none"
    )
}

# Tangency-portfolio weights per market region with bootstrap error bars.
#
# Args:
#   tpf_us, tpf_dev, tpf_emer: TPF estimation results per region; each holds
#     an "Apparent" row (the point estimate) plus bootstrap draws, with the
#     weights nested in the `coef` list-column and tagged by `market_region`.
#   cluster_order: display order of the theme clusters.
#   ci_low, ci_high: bootstrap quantiles used for the error bars.
# Returns a bar chart of weights, one facet per region.
plot_tpf_region <- function(tpf_us, tpf_dev, tpf_emer, cluster_order, ci_low = 0.05, ci_high = 0.95) {
  all <- bind_rows(tpf_us, tpf_dev, tpf_emer)
  
  # Point estimates from the full-sample ("Apparent") fit
  orig <- all %>%
    filter(id == "Apparent") %>%
    select(market_region, coef) %>%
    unnest(coef) %>%
    rename("tpf_weight" = weight)
  
  # Bootstrap summary statistics per region and term
  bs <- all %>%
    filter(id != "Apparent") %>%
    unnest(coef) %>%
    group_by(market_region, term) %>%
    summarise(
      bs_mean = mean(weight),
      bs_sd = sd(weight),
      bs_se = bs_sd / sqrt(n()),
      bs_low = weight %>% quantile(ci_low),
      bs_high = weight %>% quantile(ci_high),
      bs_prob_zero = mean(weight == 0)
    ) %>%
    left_join(orig, by = c("market_region", "term")) %>%
    mutate(bs_bias = bs_mean - tpf_weight)
  
  
  bs %>% 
    group_by(term) %>% 
    mutate(
      # Sort bars in every facet by the US weight of the term
      sort_var = tpf_weight[market_region == "us"],
      region_pretty = case_when(
        market_region == "us" ~ "USA",
        market_region == "developed" ~ "Developed Ex. USA",
        market_region == "emerging" ~ "Emerging"
      ),
      region_pretty = region_pretty %>% factor(levels = c("USA", "Developed Ex. USA", "Emerging"))
    ) %>%
    mutate(term = term %>% factor(levels = c(cluster_order, "Market"))) %>%
    ggplot(aes(reorder(term, sort_var), tpf_weight*100, fill = term)) +
    geom_col() +
    geom_errorbar(mapping = aes(ymin = bs_low*100, ymax = bs_high*100), width = 0.2, size = 0.2) +
    labs(y = "Weight in Tangency PF (%)") +
    theme(
      axis.title.x = element_blank(),
      axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
      legend.position = "none"
    ) +
    facet_wrap(~region_pretty, ncol = 1)
}

# Tangency-portfolio weights per size group with bootstrap error bars.
#
# Same structure as plot_tpf_region(), but split by `size_grp` instead of
# market region. Returns a bar chart of weights, one facet per size group
# (Mega down to Nano).
plot_tpf_size <- function(tpf_size_samples, cluster_order, ci_low = 0.05, ci_high = 0.95) {
  # Point estimates from the full-sample ("Apparent") fit
  orig <- tpf_size_samples %>%
    filter(id == "Apparent") %>%
    select(size_grp, coef) %>%
    unnest(coef) %>%
    rename("tpf_weight" = weight)
  
  # Bootstrap summary statistics per size group and term
  bs <- tpf_size_samples %>%
    filter(id != "Apparent") %>%
    unnest(coef) %>%
    group_by(size_grp, term) %>%
    summarise(
      bs_mean = mean(weight),
      bs_sd = sd(weight),
      bs_se = bs_sd / sqrt(n()),
      bs_low = weight %>% quantile(ci_low),
      bs_high = weight %>% quantile(ci_high),
      bs_prob_zero = mean(weight == 0)
    ) %>%
    left_join(orig, by = c("size_grp", "term")) %>%
    mutate(bs_bias = bs_mean - tpf_weight)
  
  
  bs %>% 
    group_by(term) %>% 
    mutate(
      # Sort bars by the mega-cap weight of the term.
      # NOTE(review): assumes `size_grp` is lowercase here ("mega"), consistent
      # with str_to_title() below -- confirm against the input data, since
      # other functions in this file compare against capitalized "Mega".
      sort_var = tpf_weight[size_grp == "mega"],
      size_grp_pretty = size_grp %>% str_to_title(),
      size_grp_pretty = size_grp_pretty %>% factor(levels = c("Mega", "Large", "Small", "Micro", "Nano"))
    ) %>%
    mutate(term = term %>% factor(levels = c(cluster_order, "Market"))) %>%
    ggplot(aes(reorder(term, sort_var), tpf_weight*100, fill = term)) +
    geom_col() +
    geom_errorbar(mapping = aes(ymin = bs_low*100, ymax = bs_high*100), width = 0.2, size = 0.2) +
    labs(y = "Weight in Tangency PF (%)") +
    theme(
      axis.title.x = element_blank(),
      axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
      legend.position = "none"
    ) +
    facet_wrap(~size_grp_pretty, ncol = 1)
}

# Plot the posterior alpha estimate through time.
#
# For each estimation window in `posterior_over_time` (a list whose elements
# hold `end_date`, `factor_mean`, `factor_cov`, and `factors`), computes the
# equal-weighted average posterior alpha across the selected factors and its
# posterior standard deviation, and plots the evolution of the estimate with
# a 95% confidence band. Optionally overlays OLS benchmark alphas.
#
# Args:
#   posterior_over_time: list of EB estimation results, one per end date.
#   orig_sig: if TRUE, restrict to originally-significant factors
#     (char_info$significance == 1); otherwise use all factors.
#   ols_incl: if TRUE, also compute OLS benchmarks (full-sample and rolling)
#     and return a two-panel cowplot; otherwise a single panel.
#   lb: lookback in years for the short-term rolling OLS alpha.
#   bw: if TRUE, use black/grey colours instead of the theme palette.
#
# Fixes: T/F literals replaced with TRUE/FALSE, and the eb_prepare() call now
# spells out `scale_alphas` (it previously relied on partial argument
# matching via `scale_alpha`; see the eb_prepare() signature).
#
# Relies on globals: `ot_region`, `char_info`, `regional_pfs` (data.table),
# `settings`, `colours_theme`, and the helper `eb_prepare()`.
plot_over_time <- function(posterior_over_time, orig_sig, ols_incl, lb, bw) {
  # Factor selection: originally-significant only, or all factors
  if (orig_sig) {
    orig_sig_values <- TRUE
  } else {
    orig_sig_values <- c(TRUE, FALSE)
  }
  
  all_factors <- tibble("char_reg" = rownames(posterior_over_time[[1]]$factor_mean)) %>%
    mutate(characteristic = char_reg %>% str_remove(paste0("__", ot_region))) %>%
    left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>%
    mutate(
      selected_factors = (orig_sig %in% orig_sig_values) 
    )
  i <- all_factors$selected_factors
  
  # Equal-weighted average posterior alpha and its posterior SD per window
  full_posterior <- posterior_over_time %>% lapply(function(eb_act) {
    a <- eb_act$factor_mean[i]
    a_cov <- eb_act$factor_cov[i, i]
    n <- length(a)
    w <- rep(1/n, n)
    post_mean <- drop(t(a) %*% w)
    post_sd <- drop(sqrt(t(w) %*% a_cov %*% w))  # SD of the equal-weighted average
    avg_ols <- mean(eb_act$factors$ols_est[i])
    tibble("end_date"= eb_act$end_date, n=n, post_mean, post_sd, avg_ols) 
  }) %>% bind_rows()
  
  # Black and white coloring
  if (bw) {
    col1 <- "black"
    col2 <- "grey35"
  } else {
    col1 <- colours_theme[1]
    col2 <- colours_theme[2]
  }
  
  if (ols_incl) {
    # Create OLS benchmarks, re-estimated at each year-end
    ols_bm <- seq.Date(from = as.Date("1959-12-31"), to = settings$end_date, by = "1 year") %>% lapply(function(end_date) {
      data <- regional_pfs[region == ot_region] %>% 
        filter(characteristic %in% all_factors$characteristic[i]) %>%
        filter(eom >= settings$start_date & eom <= end_date) %>%
        eb_prepare(
          scale_alphas = settings$eb$scale_alpha,  # full argument name (was `scale_alpha` via partial matching)
          overlapping = settings$eb$overlapping 
        )
      
      # Average of per-factor full-sample alphas
      avg_alpha_full <- data$long %>%
        group_by(characteristic) %>%
        summarise(
          n = n(),
          alpha = mean(ret_neu_scaled)
        ) %>%
        ungroup() %>%
        summarise(
          end_date = end_date,
          type = "avg_alpha_full",
          alpha = mean(alpha)
        )
      
      # Average of per-factor alphas over the last `lb` years only,
      # re-neutralized and re-scaled within the short window
      avg_alpha_st <- data$long %>%
        filter(year(eom) > (year(end_date)-lb)) %>%
        group_by(characteristic) %>%
        mutate(
          ret_neu_st = (ret - cov(ret, mkt_vw_exc)/var(mkt_vw_exc) * mkt_vw_exc)*100,
          ret_neu_st = ret_neu_st / sd(ret_neu_st) * (10 / sqrt(12))
        ) %>%
        summarise(alpha = mean(ret_neu_st)) %>%
        ungroup() %>%
        summarise(
          end_date = end_date,
          type = "avg_alpha_st",
          alpha = mean(alpha)
        )
      
      # Alpha of the equal-weighted average factor (average first, then neutralize)
      alpha_avg <- data$long %>%
        group_by(eom) %>%
        summarise(
          n = n(),
          ret = mean(ret_neu_scaled),
          mkt_vw_exc = unique(mkt_vw_exc)
        ) %>%
        ungroup() %>%
        mutate(
          ret_neu = ret - cov(ret, mkt_vw_exc) / var(mkt_vw_exc) * mkt_vw_exc
        ) %>%
        summarise(
          end_date = end_date,
          type = "alpha_avg_full",
          alpha = mean(ret_neu)
        )
      
      bind_rows(avg_alpha_full, avg_alpha_st, alpha_avg)
    }) %>% bind_rows()
    
    ols_bm_wide <- ols_bm %>% spread(key = type, value = alpha)
    
    # Top panel: posterior alpha with CI plus the OLS benchmark
    (plot_1 <- full_posterior %>%
        left_join(ols_bm_wide, by = "end_date") %>%
        ggplot(aes(end_date)) +
        geom_point(aes(y = post_mean, colour="Average Posterior Alpha", shape = "Average Posterior Alpha")) +
        geom_point(aes(y = avg_alpha_full, colour="Average OLS Alpha", shape = "Average OLS Alpha")) +
        geom_errorbar(aes(ymin = post_mean + 1.96 * post_sd, ymax = post_mean - 1.96 * post_sd)) +
        scale_colour_manual(name = "Test", values = c("Average Posterior Alpha"=col1, "Average OLS Alpha"=col2)) +
        scale_shape_manual(name = "Test", values = c("Average Posterior Alpha" = 16, "Average OLS Alpha" = 17)) +
        labs(y = "Posterior Alpha with 95% CI (%)") +
        ylim(c(0, NA)) +
        scale_x_date(breaks = seq.Date(as.Date("1960-12-31"), as.Date("2020-12-31"), by = "10 years"), date_labels = "%Y-%m") +
        theme(
          legend.title = element_blank(),
          legend.position = "top",
          axis.text.x = element_blank(),
          axis.title.x = element_blank()
        ))
    
    # Bottom panel: rolling short-window alpha
    plot_2 <- full_posterior %>%
      left_join(ols_bm_wide, by = "end_date") %>%
      ggplot(aes(end_date, avg_alpha_st)) +
      geom_col() +
      labs(y = "5-year Rolling Alpha (%)") +
      scale_x_date(breaks = seq.Date(as.Date("1960-12-31"), as.Date("2020-12-31"), by = "10 years"), date_labels = "%Y-%m") +
      theme(
        axis.title.x = element_blank()
      )
    
    # print(full_posterior %>% left_join(ols_bm_wide, by = "end_date") %>% mutate(diff_pm = post_mean - lag(post_mean), diff_aaf = avg_alpha_full - lag(avg_alpha_full)) %>% filter(end_date != as.Date("1960-12-31")) %>% summarise(sd_pm = sd(diff_pm), sd_aaf = sd(diff_aaf)))
    
    plot <- cowplot::plot_grid(plot_1, plot_2, ncol = 1, rel_heights = c(2, 1))
  } else {
    plot <- full_posterior %>%
      ggplot(aes(end_date)) +
      geom_point(aes(y = post_mean), colour=col1, shape = 16) +
      geom_errorbar(aes(ymin = post_mean + 1.96 * post_sd, ymax = post_mean - 1.96 * post_sd)) +
      labs(y = "Posterior Alpha with 95% CI (%)") +
      ylim(c(0, NA)) +
      scale_x_date(breaks = seq.Date(as.Date("1960-12-31"), as.Date("2020-12-31"), by = "10 years"), date_labels = "%Y-%m") +
      theme(
        legend.title = element_blank(),
        legend.position = "top",
        axis.title.x = element_blank()
      )
  }
  # Diagnostic: CI width at the first and last estimation date
  print(full_posterior %>% mutate(ci_width = post_sd*1.96*2) %>% filter(end_date %in% c(as.Date("1960-12-31"), settings$end_date)))
  plot
}



# Plot the maximum-likelihood estimates of the tau parameters through time.
#
# `posterior_over_time_flex` is a list where each element carries an `mle`
# data frame (columns `estimate`, `ml_est`) and an `end_date`. The alpha row
# is dropped and tau_s is relabelled tau_w for display.
#
# Relies on globals: `colours_theme`.
plot_taus_over_time <- function(posterior_over_time_flex) {
  # Stack the ML estimates of every estimation window, tagged with its end date
  tau_data <- posterior_over_time_flex %>%
    map(function(est) mutate(est$mle, end_date = est$end_date)) %>%
    bind_rows() %>%
    filter(estimate != "alpha") %>%
    mutate(estimate_pretty = if_else(estimate == "tau_s", "tau_w", estimate))
  
  upper_limit <- max(tau_data$ml_est)
  
  tau_data %>%
    ggplot(aes(end_date, ml_est, colour = estimate_pretty, linetype = estimate_pretty)) +
    geom_line() +
    scale_linetype_manual(values = c('tau_c' = "solid", 'tau_w' = "longdash"), name = '', labels = c(expression(tau[c]), expression(tau[w]))) +
    scale_colour_manual(values = c('tau_c' = colours_theme[1], 'tau_w' = colours_theme[2]), name = '', labels = c(expression(tau[c]), expression(tau[w]))) +
    scale_x_date(breaks = seq.Date(as.Date("1960-12-31"), as.Date("2020-12-31"), by = "10 years"), date_labels = "%Y-%m") +
    labs(y = "Maximum Likelihood Estimate (%)") +
    ylim(c(0, upper_limit)) +
    theme(
      legend.title = element_blank(),
      legend.position = "top",
      axis.title.x = element_blank()
    )
}

# Simulation results: false/true discovery rates under OLS, BY, and EB.
#
# Reshapes the simulation output to long form, prettifies the method and
# statistic names, and returns a faceted plot of false and true discovery
# rates as a function of tau_c, split by tau_w.
#
# Fixes: `factor(label = ...)` relied on partial argument matching and is now
# spelled `labels =`; two intermediate plots (`fdr_plot`, `true_disc_rate`)
# that were assigned but never used or returned have been removed.
plot_sim_fdr <- function(simulation) {
  # Facet strip labels, rendered as plotmath expressions via label_parsed
  tau_w_names <- c(
    `0.01` = expression(tau[w] ~ "= 0.01%"),
    `0.2` = expression(tau[w] ~ "= 0.20%")
  )
  stat_labels <- c(
    `False Discovery Rate` = expression(~"False Discovery Rate"),
    `True Discovery Rate` = expression(~"True Discovery Rate"),
    `True Discoveries` = expression(~"True Discoveries"),
    `False Discoveries` = expression(~"False Discoveries")
  )
  
  plot_data <- simulation %>%
    gather(n_disc, fdr, true_disc, false_disc, true_disc_rate, key = "stat", value = "number") %>%
    filter(stat %in% c("fdr", "true_disc", "false_disc", "true_disc_rate")) %>%
    mutate(
      stat = case_when(
        stat == "fdr" ~ "False Discovery Rate",
        stat == "true_disc" ~ "True Discoveries",
        stat == "false_disc" ~ "False Discoveries",
        stat == "true_disc_rate" ~ "True Discovery Rate"
      ),
      stat = stat %>% factor(levels = c("False Discovery Rate", "True Discovery Rate", "True Discoveries", "False Discoveries")),
      type = case_when(
        type == "by" ~ "Benjamini and Yekutieli",
        type == "ols" ~ "OLS",
        type == "eb" ~ "Empirical Bayes"
      ),
      type = type %>% factor(levels = c("OLS", "Benjamini and Yekutieli", "Empirical Bayes")),
      # tau_w_title = formatC(tau_w, digits = 2, format = "f"),
      # tau_w_title = as.character(eval(bquote(tau[w] ~ "=" ~ .(tau_w_title)~ "%")))
      tau_w_title = tau_w %>% factor(labels = tau_w_names),
      stat_title = stat %>% factor(labels = stat_labels)
    ) 
  
  # The invisible min/max points pin a common y-scale within each statistic
  # even though facet scales are free
  plot_data %>%
    filter(stat %in% c("False Discovery Rate", "True Discovery Rate")) %>%
    group_by(stat) %>%
    mutate(scale_max = max(number)) %>%
    mutate(scale_min = min(number)) %>%
    ggplot(aes(tau_c, number, colour = type, shape = type)) +
    geom_point() +
    geom_point(aes(y = scale_max), alpha = 0) +
    geom_point(aes(y = scale_min), alpha = 0) +
    geom_line() +
    labs(x = bquote(bold(tau[c])~"(%)"), colour = "Type:", linetype = "Type:", shape = "Type:") +
    facet_wrap(stat_title~tau_w_title, scales = "free_y", labeller = label_parsed) +
    # facet_wrap(stat_title~tau_w_title) +
    # facet_wrap(stat~tau_w_title, scales = "free_y", labeller = label_bquote(tau[w] ~ "=" ~ .(tau_w_title)~ "%")) +
    theme(
      axis.title.y = element_blank(),
      axis.title.x = element_text(size = 12),
      legend.position = "top"
    )
}


# Overall replication rate by size group.
#
# The replication rate per size group is the share of originally-significant
# factors whose 2.5% posterior quantile (`p025`) is above zero.
#
# Args:
#   eb_size: EB results per size group with columns `size_grp`, `p025`, ...
#   flipped: if TRUE, horizontal bars ordered Nano -> Mega.
#   text: if TRUE, add percentage labels on the bars.
#
# Fixes: defaults spelled TRUE/FALSE instead of the reassignable T/F, and the
# duplicated geom_text label code is unified (offset/size still depend on the
# orientation).
#
# Relies on globals: `char_info`, `colours_theme`.
plot_size_overall <- function(eb_size, flipped = FALSE, text = FALSE) {
  size_repl <- eb_size %>%
    left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>% 
    filter(orig_sig == 1) %>%
    group_by(size_grp) %>%
    summarise(
      repl_rate = mean(p025>0)
    )
  if (flipped) {
    size_plot <- size_repl %>%
      mutate(size_grp = size_grp %>% factor(levels = c("Nano", "Micro", "Small", "Large", "Mega"))) %>%
      ggplot(aes(x = size_grp, y = repl_rate*100)) +
      geom_col(fill = colours_theme[1]) +
      coord_flip() +
      labs(y = "Replication Rate (%)") +
      theme(axis.title.y = element_blank())
  } else {
    size_plot <- size_repl %>%
      ggplot(aes(x = size_grp, y = repl_rate*100)) +
      geom_col(fill = colours_theme[1]) +
      labs(y = "Replication Rate (%)") +
      theme(axis.title.x = element_blank())
  }
  if (text) {
    # Percentage labels; nudge and size tuned per orientation
    size_plot <- size_plot +
      geom_text(
        aes(label = str_c(formatC(round(repl_rate * 100, 1), digits = 1, format = "f"), "%")),
        nudge_y = if (flipped) 7 else 2.5,
        size = if (flipped) 5.5 else 4
      )
  }
  size_plot
}

# Replication rate per theme cluster, faceted by size group.
#
# Each facet title carries the size group's overall replication rate; bars
# within each facet are ordered by the cluster's mega-cap replication rate.
#
# Relies on globals: `char_info`.
plot_size_clusters <- function(eb_size, cluster_order) {
  # Keep only originally-significant factors; used by both summaries below
  sig_only <- eb_size %>%
    left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>%
    filter(orig_sig == 1)
  
  # Overall replication rate per size group (shown in the facet titles)
  overall <- sig_only %>%
    group_by(size_grp) %>%
    summarise(
      overall_rr = mean(p025 > 0)
    )
  
  # Cluster-level replication rate per size group
  cluster_rr <- sig_only %>%
    mutate(hcl_label = hcl_label %>% factor(levels = cluster_order)) %>%
    group_by(size_grp, hcl_label) %>%
    summarise(
      repl_rate = mean(p025 > 0)
    ) %>%
    group_by(hcl_label) %>%
    mutate(sort_var = repl_rate[size_grp == "Mega"]) %>%
    left_join(overall, by = "size_grp") %>%
    mutate(size_title = str_c(size_grp, " - Replication Rate: ", formatC(round(overall_rr * 100, 1), digits = 1, format = "f"), "%"))
  
  # Order facets Mega -> Nano by matching each group name in the titles
  titles <- unique(cluster_rr$size_title)
  title_order <- c("Mega", "Large", "Small", "Micro", "Nano") %>%
    lapply(function(grp) titles[str_detect(titles, grp)]) %>%
    unlist()
  
  cluster_rr %>%
    mutate(size_title = size_title %>% factor(levels = title_order)) %>%
    ggplot(aes(x = reorder(hcl_label, sort_var), y = repl_rate * 100, fill = hcl_label)) +
    geom_col() +
    labs(y = "Replication Rate (%)") +
    facet_wrap(~size_title, ncol = 1) +
    theme(
      axis.title.x = element_blank(),
      legend.position = "none",
      axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1, size = 10)
    )
}

# Sign test of factor replication across countries.
#
# `sign_test` is expected to carry, per characteristic, `pos_act` (share of
# countries where the factor return is positive) and a bootstrapped p-value
# `p`. Factors are classified Replicated / Not Replicated / Never Significant
# using the original-study significance flag.
#
# Returns a list with two ggplots:
#   factors:  per-factor bar chart of the share of positive countries.
#   clusters: two-panel chart (share positive, sign-test replication rate)
#             aggregated by theme cluster.
#
# Relies on globals: `char_info`, `cluster_labels`, `cluster_order`.
plot_sign_test <- function(sign_test) {
  sig <- sign_test %>%
    left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>% 
    mutate(
      # Replicated = bootstrapped p <= 5% among originally-significant factors
      type = case_when(
        p <= 0.05 & orig_sig == 1 ~ "Replicated",
        p > 0.05 & orig_sig == 1 ~ "Not Replicated",
        orig_sig == 0 ~ "Never Significant"
      ),
      type = type %>% factor(levels = c("Replicated", "Not Replicated", "Never Significant"))      
      # p_value = if_else(p < 0.05, "Significant", "Insignificant"),
      # p_value = p_value %>% factor(levels = c("Significant", "Insignificant"))
    ) 
  
  # Overall replication rate among originally-significant factors
  sig_overall <- sig %>% 
    summarise(repl_rate = sum(type == "Replicated") / sum(type %in% c("Replicated", "Not Replicated"))) %>% 
    pull(repl_rate)
  
  # Panel: per-factor share of countries with a positive return
  plot_sign_factors <- sig %>%
    ggplot(aes(reorder(characteristic, pos_act), pos_act*100, fill = type)) +
    geom_col() +
    ylim(c(0, 100)) +
    labs(y = "Countries with Positive Return (%)", fill = "Bootstrapped p-Value:") +
    geom_text(aes(x = 18, y = 100, label = str_c("Replication Rate: ", round(sig_overall*100, 2), "%")), inherit.aes = F) +
    # geom_text(aes(label = round(p, 2)), nudge_y = 1, size = 1.5) +
    theme(
      axis.title.x = element_blank(),
      # legend.title = element_blank(),
      legend.position = "top",
      axis.text.x = element_text(size = 7, angle = 90, vjust = 0, hjust = 1),
      text = element_text(size = 10)
    )
  
  # Panel A: cluster-level share of countries with positive alpha
  # (restricted to originally-significant factors)
  plot_pos <- sig %>%
    left_join(cluster_labels, by = "characteristic") %>% 
    mutate(hcl_label = hcl_label %>% factor(levels = cluster_order)) %>%
    group_by(hcl_label) %>%
    filter(type %in% c("Replicated", "Not Replicated")) %>%
    summarise(
      repl_rate = sum(type == "Replicated") / sum(type %in% c("Replicated", "Not Replicated")),
      pos = mean(pos_act)
    ) %>% 
    ggplot(aes(reorder(hcl_label, pos), pos*100, fill = hcl_label)) +
    geom_col() +
    labs(y = "Countries with Positive Alpha (%)") +
    ylim((c(0, 100))) +
    theme(
      legend.position = "none",
      axis.title.y = element_text(size=8),
      axis.title.x = element_blank(),
      axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)
    )
  
  # Panel B: cluster-level sign-test replication rate
  plot_sig <- sig %>%
    left_join(cluster_labels, by = "characteristic") %>% 
    mutate(hcl_label = hcl_label %>% factor(levels = cluster_order)) %>%
    group_by(hcl_label) %>% 
    summarise(repl_rate = sum(type == "Replicated") / sum(type %in% c("Replicated", "Not Replicated"))) %>% 
    ggplot(aes(reorder(hcl_label, repl_rate), repl_rate*100, fill = hcl_label)) +
    geom_col() +
    labs(y = "Sign Test Replication Rate (%)") +
    theme(
      legend.position = "none",
      axis.title.y = element_text(size=8),
      axis.title.x = element_blank(),
      axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)
    )
  plot_sign_clusters <- cowplot::plot_grid(plot_pos, plot_sig, ncol = 1, labels = c("A", "B"), label_y = 1, label_x = 0)
  list("factors" = plot_sign_factors, "clusters" = plot_sign_clusters)
}

# Correlation between US and World-ex-US factor returns, by cluster.
#
# Aligns the monthly return series of each factor in the US and world-ex-US
# samples, prints the quantiles of the per-factor correlations, and returns
# a bar chart of the average correlation within each theme cluster.
#
# Fix: renamed the misspelled summary column `monhts` to `months` (purely
# local; the column is never referenced downstream).
#
# Relies on globals: `char_info`, `cluster_labels`, `colours_theme`.
plot_int_cor <- function(eb_us, eb_world_ex_us) {
  cor_data <- eb_us$input$long %>%
    bind_rows(eb_world_ex_us$input$long) %>%
    select(characteristic, region, eom, ret) %>%
    spread(key = region, value = ret) %>%
    left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>%
    filter(!is.na(us) & !is.na(world_ex_us)) %>%  # months where both series exist
    group_by(characteristic) %>%
    summarise(
      months = n(),
      cor = cor(us, world_ex_us)
    ) 
  
  # Distribution of the per-factor correlations (diagnostic output)
  print(cor_data %>%
          pull(cor) %>%
          quantile())
  
  cor_data %>%
    left_join(cluster_labels, by = "characteristic") %>%
    group_by(hcl_label) %>%
    summarise(
      cor_avg = mean(cor)
    ) %>%
    ggplot(aes(reorder(hcl_label, cor_avg), cor_avg)) +
    geom_col(fill = colours_theme[1]) +
    labs(y = "Correlation of US and World ex. US factor (Avg. within Cluster)") +
    coord_flip() +
    theme(
      axis.title.y = element_blank()
    )
}

# World-ex-US vs. US factor alphas.
#
# Prints the quantiles of the per-factor correlation between the US and
# world-ex-US return series, regresses the world-ex-US OLS alpha on the US
# alpha, and returns a scatter plot with the fitted equation and its
# t-statistics annotated.
#
# Fix: renamed the misspelled summary column `monhts` to `months` (purely
# local; the column is never referenced downstream).
#
# Relies on globals: `char_info`.
plot_world_vs_us <- function(eb_us, eb_world_ex_us) {
  cor_data <- eb_us$input$long %>%
    bind_rows(eb_world_ex_us$input$long) %>%
    select(characteristic, region, eom, ret) %>%
    spread(key = region, value = ret) %>%
    left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>%
    filter(!is.na(us) & !is.na(world_ex_us)) %>%  # months where both series exist
    group_by(characteristic) %>%
    summarise(
      months = n(),
      cor = cor(us, world_ex_us)
    ) 
  
  # Distribution of the per-factor correlations (diagnostic output)
  print(cor_data %>%
          pull(cor) %>%
          quantile())
  
  
  
  # Per-factor OLS alphas side by side for the two samples
  region_data <- eb_us$factors %>%
    bind_rows(eb_world_ex_us$factors) %>%
    select(characteristic, region, ols_est) %>%
    spread(key = region, value = ols_est) %>%
    left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") 
  
  # Common axis limits so the 45-degree line is meaningful
  max_scale <- max(c(region_data$world_ex_us, region_data$us))
  min_scale <- min(c(region_data$world_ex_us, region_data$us))
  
  # Regression of world-ex-US alpha on US alpha, used for the annotation
  fit_all <- lm(world_ex_us ~ us, data = region_data)
  int <- fit_all$coefficients[1]
  slp <- fit_all$coefficients[2]
  r2 <- summary(fit_all)$r.squared
  
  # Plotmath label with the fitted equation and R^2
  eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(R)^2~"="~r2, 
                   list(a = format(unname(int), digits = 2),
                        b = format(unname(slp), digits = 2),
                        r2 = format(r2, digits = 3)))
  lbl <- as.character(as.expression(eq))
  
  t_int <- formatC(round(summary(fit_all)$coefficients["(Intercept)", "t value"], 2), format='f', digits=2)
  t_us <- formatC(round(summary(fit_all)$coefficients["us", "t value"], 2), format='f', digits=2)
  
  region_data %>%
    mutate(
      orig_sig_pretty = if_else(orig_sig == 1, "Studied", "Not Studied"),
      orig_sig_pretty = orig_sig_pretty %>% factor(levels = c("Studied", "Not Studied"))
    ) %>%
    ggplot(aes(us, world_ex_us, colour = orig_sig_pretty, shape = orig_sig_pretty)) +
    geom_point() +
    geom_abline(intercept = 0, slope = 1, linetype = "dotted") +
    # geom_abline(intercept = int, slope = slp) +
    xlim(c(min_scale, max_scale)) +
    ylim(c(min_scale, max_scale)) +
    annotate("text", label=lbl, parse=TRUE, x=min_scale, y=max_scale, hjust = 0) +
    annotate("text", label=paste0("        (", t_int, ")   (", t_us, ")"), parse=F, x=min_scale, y=max_scale-0.07, hjust = 0, size = 3.3) +
    labs(x = "US Alpha (%)", y = "World Ex. US Alpha (%)") + 
    theme(
      legend.position = "none",
      legend.title = element_blank()
    )
}

# In-sample vs. out-of-sample posterior alphas with an OLS or GLS fit.
#
# Args:
#   is_oos: list with elements "pre", "post", "pre_post", each holding a
#     `regs` data frame with columns `is`, `oos`, `characteristic`.
#   type: "OLS" for ordinary least squares; "GLS" for generalized least
#     squares weighted by the EB posterior covariance of the US factors.
# Returns a list of three ggplots (one per period), each annotated with the
# fitted equation and t-statistics.
#
# Relies on globals: `eb_est` (GLS covariance), `colours_theme`.
plot_is_oos_post <- function(is_oos, type) { # type in c("GLS", "OLS")
  plot_list <- c("pre", "post", "pre_post") %>% lapply(function(period) {
    data <- is_oos[[period]]$regs
    if (type == "OLS") {
      fit <- lm(oos ~ is, data = data)
      # Output
      int <- unname(fit$coefficients[1])
      int_se <- summary(fit)$coefficients[1, "Std. Error"]
      slope <- unname(fit$coefficients[2])
      slope_se <- summary(fit)$coefficients[2, "Std. Error"]
      r2 <- summary(fit)$adj.r.squared # Adjusted R2
      # Label
      eq_lbl <- substitute(
        italic(y) == a + b %.% italic(x)*","~~italic(R)^2~"="~r2,
        list(
          a = formatC(int, digits = 2, format = "f"),
          b = formatC(slope, digits = 2, format = "f"),
          r2 = formatC(r2, digits = 2, format = "f"))
      )
    }
    if (type == "GLS") {
      # Design matrix [1, is]; the weighting matrix is the EB posterior
      # covariance of the corresponding US factors
      x <- cbind(rep(1, nrow(data)), data$is)
      y <- data$oos
      chars <- str_c(data$characteristic, "__us")
      
      gls_cov <- eb_est$us$factor_cov[chars, chars]
      # GLS estimator: (X' S^-1 X)^-1 X' S^-1 y
      gls_est <- solve(t(x) %*% solve(gls_cov) %*% x) %*% t(x) %*% solve(gls_cov) %*% y
      gls_res <- y - x %*% gls_est
      # Residual variance scale with n - k degrees of freedom
      gls_e_var <- 1/(nrow(x)-ncol(x)) * t(gls_res) %*% solve(gls_cov) %*% (gls_res)
      gls_se <- sqrt(diag(drop(gls_e_var)* solve(t(x) %*% solve(gls_cov) %*% x)))
      # Output
      int <- gls_est[1, 1]
      int_se <- gls_se[1]
      slope <- gls_est[2, 1]
      slope_se <- gls_se[2]
      # Label (R2 doesn't really translate to GLS, because the mean prediction is no longer a good baseline)
      eq_lbl <- substitute(
        italic(y) == a + b %.% italic(x),
        list(
          a = formatC(int, digits = 2, format = "f"),
          b = formatC(slope, digits = 2, format = "f"))
      )
    }
    
    # Square axis limits so the 45-degree line is meaningful
    min_y <- min(data$is, data$oos)
    max_y <- max(data$is, data$oos)
    
    # t-statistics shown underneath the fitted equation
    t_int <- formatC(round(int/int_se, 2), format='f', digits=2)
    t_is <- formatC(round(slope/slope_se, 2), format='f', digits=2)
    t_lbl <- paste0("       (", t_int, ")   (", t_is, ")")
    
    data %>%
        ggplot(aes(is, oos)) +
        geom_point(colour = colours_theme[1]) +
        ylim(c(min_y, max_y)) +
        xlim(c(min_y, max_y)) +
        geom_hline(yintercept = 0, linetype = "solid") +
        geom_vline(xintercept = 0, linetype = "solid") +
        geom_abline(slope = 1, intercept = 0, linetype = "dotted") +
        ggtitle(label = eq_lbl, subtitle = t_lbl) +
        labs(x = "In-Sample", y = "Out-of-Sample")
  })
  plot_list
}

# Quadratic variant of plot_is_oos_post: regresses out-of-sample alphas on
# in-sample alphas AND their square, per period, and overlays a loess smooth.
# - is_oos: list with one element per period, each containing $regs with
#   columns is, oos, characteristic.
# - type:   "OLS" or "GLS" (GLS weights by eb_est$us$factor_cov).
# Returns a list of three ggplot objects (one per period).
# Note: the local variable `is` (the linear coefficient) shadows methods::is
# within this function; harmless here since `is()` is never called.
plot_is_oos_post_quad <- function(is_oos, type) { # type in c("OLS", "GLS")
  plot_list <- c("pre", "post", "pre_post") %>% lapply(function(period) {
    data <- is_oos[[period]]$regs
    if (type == "OLS") {
      fit <- lm(oos ~ is + I(is^2), data = data)
      # Output
      int <- unname(fit$coefficients[1])
      int_se <- summary(fit)$coefficients[1, "Std. Error"]
      is <- unname(fit$coefficients[2])
      is_se <- summary(fit)$coefficients[2, "Std. Error"]
      issq <- unname(fit$coefficients[3])
      issq_se <- summary(fit)$coefficients[3, "Std. Error"]
      r2 <- summary(fit)$adj.r.squared # Adjusted R2
      # Label: the sign of the quadratic term is printed separately (s) so the
      # coefficient itself (c) can be shown as an absolute value
      eq_lbl <- substitute(
        italic(y) == a + b %.% italic(x)* ~ s ~ c %.% italic(x)^2*","~~italic(R)^2~"="~r2,
        list(
          a = formatC(int, digits = 2, format = "f"),
          b = formatC(is, digits = 2, format = "f"),
          s = ifelse(sign(issq)==1, "+", "-"),
          c = formatC(unname(abs(issq)), digits = 2, format = "f"),
          r2 = formatC(r2, digits = 2, format = "f"))
      )
    }
    if (type == "GLS") {
      # GLS on [1, is, is^2] with the US factor covariance as weighting matrix
      x <- cbind(rep(1, nrow(data)), data$is, data$is^2)
      y <- data$oos
      chars <- str_c(data$characteristic, "__us")
      
      gls_cov <- eb_est$us$factor_cov[chars, chars]
      gls_est <- solve(t(x) %*% solve(gls_cov) %*% x) %*% t(x) %*% solve(gls_cov) %*% y
      gls_res <- y - x %*% gls_est
      gls_e_var <- 1/(nrow(x)-ncol(x)) * t(gls_res) %*% solve(gls_cov) %*% (gls_res)
      gls_se <- sqrt(diag(drop(gls_e_var)* solve(t(x) %*% solve(gls_cov) %*% x)))
      # Output
      int <- gls_est[1, 1]
      int_se <- gls_se[1]
      is <- gls_est[2, 1]
      is_se <- gls_se[2]
      issq <- gls_est[3, 1]
      issq_se <- gls_se[3]
      # Label
      eq_lbl <- substitute(
        italic(y) == a + b %.% italic(x)* ~ s ~ c %.% italic(x)^2,
        list(
          a = formatC(int, digits = 2, format = "f"),
          b = formatC(is, digits = 2, format = "f"),
          s = ifelse(sign(issq)==1, "+", "-"),
          c = formatC(unname(abs(issq)), digits = 2, format = "f"))
      )
    }
    
    # Common axis range so the 45-degree reference line is meaningful
    min_y <- min(data$is, data$oos)
    max_y <- max(data$is, data$oos)
    
    # t-statistics in the subtitle, spaced to line up under the equation
    t_int <- formatC(round(int/int_se, 2), format='f', digits=2)
    t_is <- formatC(round(is/is_se, 2), format='f', digits=2)
    t_issq <- formatC(round(issq/issq_se, 2), format='f', digits=2)
    t_lbl <- paste0("      (", t_int, ")    (", t_is, ")         (", t_issq, ")")
    
    data %>%
      ggplot(aes(is, oos)) +
      geom_point(colour = colours_theme[1]) +
      ylim(c(min_y, max_y)) +
      xlim(c(min_y, max_y)) +
      geom_hline(yintercept = 0, linetype = "solid") +
      geom_vline(xintercept = 0, linetype = "solid") +
      geom_abline(slope = 1, intercept = 0, linetype = "dotted") +
      geom_smooth(method = "loess", span = 1, formula = "y~x") +
      ggtitle(label = eq_lbl, subtitle = t_lbl) +
      labs(x = "In-Sample", y = "Out-of-Sample")
  })
  plot_list
}

# Effect Size Plot
# Builds three bar charts of average alpha by cluster: world-wide, by region
# (US / Developed / Emerging), and by US size group.
# - type:      "ols" uses the ols_est column, "eb" the post_mean column.
# - orig_sig:  if TRUE, keep only factors found significant in the original
#              studies (per char_info$significance).
# - cluster_order: factor-level ordering for the cluster labels.
# Relies on the globals eb_est, eb_us_size and char_info.
# Returns list(effect_world, effect_regions, effect_size) of ggplot objects.
plot_effects <- function(type, orig_sig, cluster_order) {  # type in c("ols", "eb")
  # Which original-significance values pass the filter below
  if (orig_sig) {
    orig_sig_values <- TRUE
  } else {
    orig_sig_values <- c(TRUE, FALSE)
  }
  # Column that holds the alpha estimate for the requested method
  if (type == "ols") {
    alpha_est <- "ols_est"
  } else if (type == "eb") {
    alpha_est <- "post_mean"
  }
  
  # World: one bar per cluster, sorted by its mean alpha
  effect_world <- eb_est$world$factors %>%
    left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>%
    filter(orig_sig %in% orig_sig_values) %>%
    mutate(hcl_label = hcl_label %>% factor(levels = cluster_order)) %>%
    group_by(hcl_label) %>%
    summarise(mean_alpha = mean(.data[[alpha_est]])) %>%
    ggplot(aes(reorder(hcl_label, mean_alpha), mean_alpha, fill = hcl_label)) + 
    geom_col() +
    coord_flip() +
    labs(y = paste(str_to_upper(type), "Alpha Estimate (%)")) +
    theme(
      axis.title.y = element_blank(),
      legend.position = "none"
    )
  
  # By region: clusters sorted by their US mean alpha across all facets
  effect_regions <- eb_est$all$factors %>%
    left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>%
    filter(orig_sig %in% orig_sig_values) %>%
    mutate(hcl_label = hcl_label %>% factor(levels = cluster_order)) %>%
    group_by(region, hcl_label) %>%
    summarise(mean_alpha = mean(.data[[alpha_est]])) %>%
    group_by(hcl_label) %>%
    mutate(
      sort_var = mean_alpha[region == "us"],
      region_pretty = case_when(
        region == "us" ~ "US",
        region == "developed" ~ "Developed",
        region == "emerging" ~ "Emerging"
      ),
      region_pretty = region_pretty %>% factor(levels = c("US", "Developed", "Emerging"))
    ) %>%
    ggplot(aes(reorder(hcl_label, sort_var), mean_alpha, fill = hcl_label)) + 
    geom_col() +
    coord_flip() +
    scale_y_continuous(breaks = seq(-0.2, 1, 0.2)) +
    facet_wrap(~region_pretty, scales = "free_x") +
    labs(y = paste("Average", str_to_upper(type) ,"Alpha (%)")) +
    theme(
      axis.title.y = element_blank(),
      legend.position = "none"
    )
  
  # By US size group: clusters sorted by their mean alpha among Mega caps
  effect_size <- eb_us_size %>%
    left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>%
    filter(orig_sig %in% orig_sig_values) %>%
    mutate(hcl_label = hcl_label %>% factor(levels = cluster_order)) %>%
    group_by(size_grp, hcl_label) %>%
    summarise(mean_alpha = mean(.data[[alpha_est]])) %>%
    group_by(hcl_label) %>%
    mutate(sort_var = mean_alpha[size_grp == "Mega"]) %>%
    ggplot(aes(reorder(hcl_label, sort_var), mean_alpha, fill = hcl_label)) +
    geom_col() +
    coord_flip() +
    scale_y_continuous(breaks = seq(0, 1.5, 0.50)) +
    facet_wrap(~size_grp, nrow = 1, scales = "free_x") +
    labs(y = paste("Average", str_to_upper(type) ,"Alpha (%)")) +
    theme(
      axis.title.y = element_blank(),
      legend.position = "none"
    )
  
  list(effect_world, effect_regions, effect_size)
}

# Replication Rate by Cluster
# Bar chart of the share of factors in each cluster whose posterior 2.5th
# percentile (p025) lies above zero, i.e. the cluster-level replication rate.
# - eb_factors:    factor-level posterior table (columns characteristic,
#                  region, hcl_label, p025, ...).
# - orig_sig:      if TRUE, keep only factors found significant in the
#                  original studies (per the global char_info).
# - cluster_order: factor-level ordering for the cluster labels.
plot_repl_cluster <- function(eb_factors, orig_sig, cluster_order) {
  if (orig_sig) {
    factor_subset <- eb_factors %>%
      left_join(char_info %>% select(characteristic, significance), by = "characteristic") %>%
      filter(significance == TRUE)
  } else {
    factor_subset <- copy(eb_factors)
  }
  # The replication rate is only meaningful within a single region
  if (uniqueN(factor_subset$region) != 1) {
    warning("!!!MULTIPLE REGIONS INCLUDED!!!")
  }
  
  factor_subset %>%
    mutate(hcl_label = hcl_label %>% factor(levels = cluster_order)) %>%
    group_by(hcl_label) %>%
    summarise(
      n = n(),
      repl_rate = mean(p025 > 0),
      # Tiny n-based tie-breaker so equal rates sort by cluster size
      sort_var = repl_rate + n / 100000
    )  %>%
    ggplot(aes(reorder(hcl_label, sort_var), repl_rate * 100, fill = hcl_label)) +
    geom_col() +
    labs(y = "Replication Rate (%)") +
    coord_flip() +
    theme(
      axis.title.y = element_blank(),
      legend.position = "none"
    )
}

# Plot Replication Rate as a Function of Tau - Benchmark against Harvey et al. (2016)
# Traces how the replication rate varies with the cluster-level prior
# dispersion tau_c (one facet per tau_w in tau_ws), and marks the tau_c
# implied by our data as well as by Harvey, Liu, and Zhu (2016)'s baseline
# and conservative simulation scenarios.
# - harvey_base_res / harvey_worst_res: simulation results whose $sim[[1]]
#   elements carry an $mle tibble with columns coef, mle.
# - tau_ws: vector of within-cluster dispersion values to evaluate.
# - act_rr: the actual replication rate, drawn as a dotted reference line.
# Relies on the globals eb_est and char_info. Returns a ggplot object.
plot_harvey <- function(harvey_base_res, harvey_worst_res, tau_ws, act_rr) {
  # Average the simulated tau_c MLEs under the two Harvey et al. scenarios
  mle_est_base <- harvey_base_res$sim[[1]] %>% 
    lapply(function(x) x$mle) %>% 
    bind_rows() %>%
    mutate(type = "baseline")
  
  mle_est_worst <- harvey_worst_res$sim[[1]] %>% 
    lapply(function(x) x$mle) %>% 
    bind_rows() %>%
    mutate(type = "worst_case")
  
  mle_est <- mle_est_base %>% bind_rows(mle_est_worst)
  
  mle_summary <- mle_est %>%
    group_by(type, coef) %>%
    summarise(
      n = n(),
      coef_mean = mean(mle)
    )
  
  tc_harvey_base <- mle_summary %>% filter(type == "baseline" & coef == "tc") %>% pull(coef_mean)
  tc_harvey_worst <- mle_summary %>% filter(type == "worst_case" & coef == "tc") %>% pull(coef_mean)
  
  # Replication Rate under alternative Tau's.
  # Cluster-membership outer product: mm[i, j] = 1 iff factors i and j share
  # a cluster (drives the cluster-correlated part of the prior covariance).
  m <- eb_est$us$factors %>%
    select(characteristic, ols_est, hcl_label) %>%
    mutate(cm = 1) %>%
    select(characteristic, hcl_label, cm) %>%
    spread(key = hcl_label, value = cm) %>% 
    select(-characteristic) %>% 
    as.matrix()
  m[is.na(m)] <- 0
  mm <- m %*% t(m)
  
  alpha_hat <- eb_est$us$factors %>% pull(ols_est)
  n_factors <- length(alpha_hat)  # generalized: was hard-coded as 153
  alpha_0_vec <- rep(0, n_factors)
  sigma <- eb_est$us$sigma
  
  tc_act <- eb_est$us$mle %>% filter(estimate == "tau_c") %>% pull(ml_est) %>% round(2)
  
  search_grid <- expand.grid(
    tau_c = c(seq(0.15, 0.46, by = 0.01), tc_act, tc_harvey_base, tc_harvey_worst),
    tau_w = tau_ws
  )
  
  # Posterior replication rate for each (tau_c, tau_w) combination, using the
  # standard normal-normal posterior update with prior covariance omega
  repl_by_tau <- seq_len(nrow(search_grid)) %>% lapply(function(i) {
    tw <- search_grid[i, "tau_w"]
    tc <- search_grid[i, "tau_c"]
    omega <- diag(n_factors) * tw^2 + mm * tc^2
    post_cov <- solve(solve(omega) + solve(sigma))
    post_alpha <- post_cov %*% (solve(omega) %*% alpha_0_vec + solve(sigma) %*% alpha_hat)
    
    tibble(characteristic = eb_est$us$factors$characteristic, post_mean = drop(post_alpha), post_sd = sqrt(diag(post_cov))) %>%
      left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>%
      filter(orig_sig == TRUE) %>%
      summarise(repl_rate = mean(post_mean - 1.96 * post_sd > 0)) %>%
      mutate(tau_c = tc, tau_w = tw)
  }) %>% bind_rows()
  
  # Set TW labels (plotmath expressions, one per tau_w)
  n_tw <- length(tau_ws) 
  tau_w_names <- vector(mode = "expression", length = n_tw)
  for (i in seq_len(n_tw)) {
    tau_w_names[i] <- c(bquote(bold(tau[w])  ~ "=" ~ .(unname(tau_ws[i])) ~ "%"))
  }
  names(tau_w_names) <- tau_ws
  
  # Generate Important Points (the three highlighted tau_c values)
  tau_points <- repl_by_tau %>%
    filter(tau_c %in% c(tc_act, tc_harvey_base, tc_harvey_worst)) %>%
    distinct() %>%
    mutate(
      tau_w_title = tau_w %>% factor(labels = tau_w_names),
      type = case_when(
        tau_c == tc_act ~ "Estimated from Data",
        tau_c == tc_harvey_base ~ "Harvey, Liu, and Zhu (2016): Baseline",
        tau_c == tc_harvey_worst ~ "Harvey, Liu, and Zhu (2016): Conservative",
        TRUE ~ "Other"
      ),
      type = type %>% factor(levels = c("Harvey, Liu, and Zhu (2016): Conservative", "Harvey, Liu, and Zhu (2016): Baseline", "Estimated from Data"))
    )
  print(tau_points)
  
  plot <- repl_by_tau %>%
    mutate(
      tau_w_title = tau_w %>% factor(labels = tau_w_names),
      type = case_when(
        tau_c == tc_act ~ "Estimated from Data",
        tau_c == tc_harvey_base ~ "Harvey, Liu, and Zhu (2016): Baseline",
        tau_c == tc_harvey_worst ~ "Harvey, Liu, and Zhu (2016): Conservative",
        TRUE ~ " "
      ),
      # BUG FIX: the Conservative level previously read "Harvey et al. (2016):
      # Conservative", which matched none of the case_when() values above, so
      # those rows were silently coerced to NA by factor()
      type = type %>% factor(levels = c("Estimated from Data", "Harvey, Liu, and Zhu (2016): Baseline", "Harvey, Liu, and Zhu (2016): Conservative", " "))
    ) %>%
    ggplot(aes(tau_c, repl_rate * 100)) +
    geom_point(data = tau_points, aes(colour = type, shape = type, stroke = 1), size = 3) +
    geom_line(alpha = 1, size = 0.6) +
    geom_hline(yintercept = act_rr*100, linetype = "dotted") +
    scale_x_continuous(breaks = seq(0.05, max(search_grid$tau_c), 0.05)) +
    ylim(c(0, 100)) +
    theme(legend.title = element_blank(), legend.position = "top") +
    labs(y = "Replication Rate (%)", x = bquote(bold(tau[c])~"(%)"), colour = expression(tau[c]), shape = expression(tau[c]))
  if (n_tw > 1) {
    plot <- plot + facet_wrap(~tau_w_title, labeller = label_parsed)
  }
  return(plot)
}

# Single Factor TPF --
# Plot TPF Factor: Cluster + Market
# For each cluster: the ex-post tangency-portfolio Sharpe ratio of the market
# plus all of the cluster's factors (bar height), alongside the average SR of
# market + one single cluster factor. The market's own SR is the dashed line.
# - data_wide:      return matrix with a "market" column plus one column per
#                   characteristic.
# - cluster_labels: mapping characteristic -> hcl_label.
# - s:              shrinkage parameter forwarded to epo_tpf().
# Relies on the globals cluster_order, epo_tpf and sr_func.
plot_tpf_one_cluster <- function(data_wide, cluster_labels, s) {
  mkt_sr <- mean(data_wide$market)/sd(data_wide$market)
  one_cluster <- unique(cluster_labels$hcl_label) %>% lapply(function(c) {
    cl_chars <- cluster_labels %>%
      filter(characteristic %in% colnames(data_wide) & hcl_label == c) %>%
      pull(characteristic)
    # Cluster SR
    cl <- data_wide %>% select(market, all_of(cl_chars))
    w <- cl %>% epo_tpf(s = s)  
    sr_all <- cl %>% sr_func(w = w)
    # Average SR across market + single-factor portfolios
    sr_single_avg <- cl_chars %>% sapply(function(char) {
      cl_sub <- data_wide %>% select(market, all_of(char))
      # BUG FIX: previously used the global opt_s instead of the s argument
      w <- cl_sub %>% epo_tpf(s = s)
      cl_sub %>% sr_func(w = w)
    }) %>%
      mean()
    
    tibble(hcl_label=c, sr_all=sr_all, sr_single_avg=sr_single_avg)
  }) %>%
    bind_rows()
  
  one_cluster %>%
    mutate(hcl_label = hcl_label %>% factor(levels = cluster_order)) %>%
    ggplot(aes(reorder(hcl_label, sr_all), sr_all, fill = hcl_label)) +
    geom_col() +
    coord_flip() +
    geom_hline(yintercept = mkt_sr, linetype = "dashed") +
    theme(legend.position = "none", axis.title.y = element_blank()) +
    labs(y = "Monthly Sharpe Ratio: Market + Cluster")
}

# Plot TPF Factor: Exclude one cluster
# Measures each cluster's importance by the relative drop in the tangency
# portfolio's Sharpe ratio when the whole cluster (or the market) is removed.
# - data_wide:      return matrix with a "market" column plus factor columns.
# - cluster_labels: mapping characteristic -> hcl_label.
# - s:              shrinkage parameter forwarded to epo_tpf().
# Relies on the globals cluster_order, epo_tpf and sr_func.
plot_tpf_excl_cl <- function(data_wide, cluster_labels, s) {
  # Benchmark: SR of the full tangency portfolio (market + all factors)
  w_full <- epo_tpf(data_wide, s = s)
  full_sr <- sr_func(data_wide, w = w_full)
  
  groups <- c("Market", unique(cluster_labels$hcl_label))
  excl_one <- groups %>% lapply(function(grp) {
    keep <- cluster_labels %>%
      filter(characteristic %in% colnames(data_wide) & hcl_label != grp) %>%
      pull(characteristic)
    # The market column stays in unless the market itself is being excluded
    if (grp != "Market") {
      keep <- c(keep, "market")
    }
    # SR of the tangency portfolio without this group
    sub <- data_wide %>% select(all_of(keep))
    w_sub <- epo_tpf(sub, s = s)
    tibble(hcl_label = grp, sr = sr_func(sub, w = w_sub))
  }) %>%
    bind_rows()
  
  excl_one %>%
    mutate(hcl_label = factor(hcl_label, levels = c(cluster_order, "Market"))) %>%
    ggplot(aes(reorder(hcl_label, -sr), (full_sr - sr) / full_sr, fill = hcl_label)) +
    geom_col() +
    coord_flip() +
    theme(legend.position = "none", axis.title.y = element_blank()) +
    labs(y = "Percentage Drop in Monthly SR from Excluding Cluster")
}

# Plot TPF Factor: Single Factor Importance
# For every column of data_wide (each factor and the market), recompute the
# tangency-portfolio Sharpe ratio with that column left out and plot the
# relative SR drop per factor, faceted by cluster.
# - data_wide:      return matrix with a "market" column plus factor columns.
# - cluster_labels: mapping characteristic -> hcl_label.
# - s:              shrinkage parameter forwarded to epo_tpf().
# Relies on the globals cluster_order, epo_tpf and sr_func.
# Returns the faceted ggplot (last expression).
plot_tpf_factor_imp <- function(data_wide, cluster_labels, s) {
  # Benchmark: SR of the full tangency portfolio
  epo_w <- data_wide %>% epo_tpf(s = s)
  full_sr <- data_wide %>% sr_func(w = epo_w)
  # Leave-one-out SR for every column
  each_factor <- colnames(data_wide) %>% lapply(function(c) {
    data <- data_wide %>% select(-all_of(c))
    # BUG FIX: previously used the global opt_s instead of the s argument
    w <- data %>% epo_tpf(s = s)
    sr <- data %>% sr_func(w = w)
    # Output
    tibble(characteristic = c, sr = sr)
  }) %>%
    bind_rows()
  
  # NOTE: an earlier version also built a top-10 bar chart here, but its value
  # was discarded (never assigned, returned, or printed), so it was removed.
  each_factor %>%
    left_join(cluster_labels, by = "characteristic") %>%
    mutate(
      hcl_label = if_else(characteristic == "market", "Market", hcl_label),
      hcl_label = hcl_label %>% factor(levels = c(cluster_order, "Market")),
      drop = full_sr - sr,
      drop_prop = drop/full_sr,
      rank = frank(-drop_prop)
    ) %>%
    group_by(hcl_label) %>%
    filter(characteristic != "market") %>%
    ggplot(aes(reorder(characteristic, drop_prop), drop_prop*100, fill = hcl_label)) +
    geom_col() +
    coord_flip() +
    theme(axis.title.y = element_blank(), legend.position = "none") +
    labs(y = "Drop in TPF SR (% of full)", fill = "Cluster")+
    facet_wrap(~hcl_label, scales = "free_y")
}

# Plot TPF Factor: Single Factor Importance within Cluster
# Within each cluster, recompute the cluster's tangency-portfolio Sharpe ratio
# (market + cluster factors) with each factor left out, and plot the relative
# SR drop per factor, faceted by cluster.
# - data_wide:      return matrix with a "market" column plus factor columns.
# - cluster_labels: mapping characteristic -> hcl_label.
# - s:              shrinkage parameter forwarded to epo_tpf().
# Relies on the globals cluster_order, epo_tpf and sr_func.
plot_tpf_factor_imp_cluster <- function(data_wide, cluster_labels, s) {
  within_cluster <- unique(cluster_labels$hcl_label) %>% lapply(function(c) {
    cl_chars <- cluster_labels %>%
      filter(characteristic %in% colnames(data_wide) & hcl_label == c) %>%
      pull(characteristic)
    # Full Cluster tpf
    data <- data_wide %>% select(market, all_of(cl_chars))
    # BUG FIX: previously used the global opt_s instead of the s argument
    w_all <- data %>% epo_tpf(s = s)
    sr_all <- data %>% sr_func(w = w_all)
    # Individual leave-one-out SRs within the cluster
    sr_chars <- cl_chars %>% lapply(function(char) {
      sub <- data %>% select(-all_of(char))
      w_sub <- sub %>% epo_tpf(s = s)  
      sr_sub <- sub %>% sr_func(w = w_sub)
      tibble(excl_char = char, sr = sr_sub)
    }) %>% bind_rows()
    
    sr_chars %>% mutate(hcl_label = c, sr_all = sr_all)
  }) %>%
    bind_rows()
  
  within_cluster %>%
      mutate(
        drop = sr_all - sr,
        drop_prop = drop/sr_all,
        hcl_label = hcl_label %>% factor(levels = cluster_order)
      ) %>%
      group_by(hcl_label) %>%
      ggplot(aes(reorder(excl_char, drop_prop), drop_prop*100, fill = hcl_label)) +
      geom_col() +
      coord_flip() +
      theme(axis.title.y = element_blank(), legend.position = "none") +
      labs(y = "Drop in cluster TPF SR (% of full cluster)", fill = "Cluster") +
      facet_wrap(~hcl_label, scales = "free")
}

# Plot TPF Factor: The Evolution of the TPF
# Shows how the ex-post tangency-portfolio Sharpe ratio grows as factors are
# "discovered" over time: for each year after the earliest sample_end, the TPF
# is re-formed from all factors published by that year (plus the market).
# - data / data_wide: long and wide versions of the factor return panel; the
#   wide version must contain a "market" column.
# - char_info:        per-characteristic info with sample_end and significance.
# - orig_sig_values:  significance values to keep (e.g. TRUE or c(TRUE, FALSE)).
# - s:                shrinkage parameter forwarded to epo_tpf().
# Relies on the globals epo_tpf and sr_func.
# Returns list(plot = stacked SR/count figure, data = the per-year SR table).
# NOTE(review): the annotation x/y coordinates below are hand-placed for the
# specific sample in the paper; they will not adapt to different data.
plot_tpf_evolution <- function(data, data_wide, char_info, orig_sig_values, s) {
  mkt_sr <- mean(data_wide$market) / sd(data_wide$market)
  
  # Years in which at least one factor has already been published
  years <- data %>% 
    filter(year(eom) > min(char_info$sample_end)) %>% 
    mutate(year = year(eom)) %>%
    pull(year) %>%
    unique()
  
  # TPF SR using only factors discovered (sample_end <= y) by each year
  sr_over_time <- years %>% lapply(function(y) {
    discovered_chars <- char_info %>% filter(sample_end <= y & significance %in% orig_sig_values) %>% pull(characteristic)
    sub <- data_wide %>% select(all_of(discovered_chars), "market")  
    w <- sub %>% epo_tpf(s = s) 
    # tibble(w = w, char = colnames(sub)) %>% left_join(char_info %>% select("char"=characteristic, sample_end), by = "char") %>% arrange(-w)
    # 2002: seasonality coincides with noa_at that also gets a large weight
    
    tibble(year = y, tpf_sr = sr_func(sub, w), n = ncol(sub))  
  }) %>% bind_rows()
  
  # Prepend a market-only starting point the year before the first discovery
  sr_over_time <- sr_over_time %>%
    bind_rows(tibble(year = min(years)-1, tpf_sr = mkt_sr, n = 0))
  
  (sr_plot <- sr_over_time %>%
      ggplot(aes(year, tpf_sr)) +
      geom_point() +
      geom_line() +
      ylim(c(0, NA)) +
      # geom_hline(yintercept = full_sr) +
      annotate("text", x = 1971, 
               y = 0, label = "Market", colour='black') +
      geom_segment(aes(x = 1971, y = 0.02, xend = 1971, yend = 0.1), size=0.1,arrow = arrow(length = unit(0.2, "cm"))) +
      labs(y = "Ex-Post Tangency SR", x = "Year") +
      theme(axis.title.x = element_blank()) +
      annotate("text", x = 1972, 
               y = 0.34, label = "Beta", colour='black') +
      geom_segment(aes(x = 1972, y = 0.31, xend = 1972, yend = 0.23), size=0.1,arrow = arrow(length = unit(0.2, "cm"))) +
      annotate("text", x = 1979, 
               y = 0.45, label = "Earning-to-Price", colour='black') +
      geom_segment(aes(x = 1979, y = 0.42, xend = 1979, yend = 0.30), size=0.1,arrow = arrow(length = unit(0.2, "cm"))) +
      annotate("text", x = 1983, 
               y = 0.08, label = "Earnings Momentum", colour='black') +
      geom_segment(aes(x = 1981, y = 0.10, xend = 1981, yend = 0.25), size=0.1,arrow = arrow(length = unit(0.2, "cm"))) +
      annotate("text", x = 1989, 
               y = 0.25, label = "Price Momentum", colour='black') +
      geom_segment(aes(x = 1989, y = 0.27, xend = 1989, yend = 0.39), size=0.1,arrow = arrow(length = unit(0.2, "cm"))) +
      annotate("text", x = 1991, 
               y = 0.73, label = "Operating Accruals", colour='black') +
      geom_segment(aes(x = 1991, y = 0.69, xend = 1991, yend = 0.6), size=0.1,arrow = arrow(length = unit(0.2, "cm"))) +
      annotate("text", x = 2002, 
               y = 0.9, label = "Seasonality", colour='black') +
      geom_segment(aes(x = 2002, y = 0.87, xend = 2002, yend = 0.78), size=0.1,arrow = arrow(length = unit(0.2, "cm"))))
  
  # Companion panel: number of factors available each year
  n_plot <- sr_over_time %>%
    ggplot(aes(year, n)) +
    geom_point() +
    geom_line() +
    labs(y = "Factor Discovered", x = "Year of Discovery")
  
  list("plot"=cowplot::plot_grid(sr_plot, n_plot, ncol = 1, rel_heights = c(0.7, 0.3)), "data"=sr_over_time)
}

# Plot performance over time
# Cumulative (approximate, summed) factor returns or CAPM alphas for the US
# and World-ex-US regions, plus a printed LaTeX summary table (SR and t-stat).
# - data:     data.table with columns characteristic, region, eom, ret,
#             mkt_vw_exc.
# - oos:      keep only observations after each factor's original sample_end.
# - alphas:   if TRUE, plot market-model alphas instead of excess returns.
# - scale:    if TRUE, scale each region's series to 10% annualized vol.
# - orig_sig: keep only factors found significant in the original studies.
# - start:    first month to include.
# Relies on the globals cluster_labels and char_info. Returns the ggplot.
plot_ts <- function(data, oos, alphas, scale, orig_sig, start = as.Date("1986-01-01")) {
  # BUG FIX: work on a copy — the := operations below previously modified the
  # caller's data.table by reference
  data <- copy(data)
  data[, region := case_when(
    region == "us" ~ "US",
    region == "world_ex_us" ~ "World ex. US"
  )]
  data <- data[eom >= start]
  data <- cluster_labels[data, on = "characteristic"]
  if (oos) {
    # Keep only post-publication (out-of-sample) months per factor
    data <- setDT(char_info)[, .(characteristic, sample_end)][data, on = .(characteristic)]
    data <- data[year(eom) > sample_end][, sample_end := NULL]
  }
  if (orig_sig) {
    data <- setDT(char_info)[, .(characteristic, significance)][data, on = .(characteristic)]
    data <- data[significance == TRUE][, significance := NULL]
  }
  y_axis <- paste0("Cumulative ", if_else(alphas, "Alpha ", "Excess Return "), if_else(oos, "(OOS)", "(IS)"))
  
  # Equal-weighted average factor return and market return per region-month
  agg <- data[, .(ret = mean(ret), mkt = mean(mkt_vw_exc)), by = .(region, eom)]
  if (alphas) {
    # Market-model alpha: subtract the full-sample beta times the market
    agg[, ret := ret - cov(ret,mkt)/var(mkt)*mkt, by = .(region)]
  }
  if (scale) {
    # Scale to a 10% annualized volatility for comparability
    agg[, ret := ret / (sd(ret)*sqrt(12)/0.1), by = .(region)]
  }
  agg %>% setorder(region, eom)
  # Approximate cumulative return: simple sum rather than compounding
  agg[, cumret_app := cumsum(ret), by = region]
  plot <- agg %>%
    ggplot(aes(eom, cumret_app, colour = region)) +
    geom_line() +
    labs(y = y_axis) +
    theme(
      legend.position = c(0.85, 0.35),
      legend.title = element_blank(),
      axis.title.x = element_blank()
    )
  # Table: annualized SR and t-statistic per region, printed as LaTeX
  tbl <- agg %>%
    group_by(region) %>%
    summarise(
      n = n(),
      meanret = mean(ret),
      vol = sd(ret),
      ret_vol = meanret/vol*sqrt(12),
      t = meanret/(vol/sqrt(n))
    ) %>%
    mutate(meanret = meanret*12)
  tbl %>% 
    select(region, ret_vol, t) %>%
    pivot_longer(c(ret_vol, t)) %>%
    mutate(
      value = formatC(value, digits=2, format = "f"),
      value = if_else(name == "t", paste0("(", value, ")"), value)
    ) %>%
    mutate(
      region = if_else(name == "t", "", region)
    ) %>%
    select(-name) %>%
    rename("Region"=region, "Full sample"=value) %>%
    xtable(align = "llc") %>%
    print(include.rownames = F)
  # Output
  return(plot)
}

# Plot OOS performance of significant factors
# Cumulative market-model alphas (scaled to 10% annualized vol) of the
# significant-factor portfolios for the US and World-ex-US, plus a printed
# LaTeX caption table with information ratios and t-statistics for the full
# sample and the post-2012 ("post Harvey et al.") sub-sample.
# - sig_oos_pfs: panel with columns region, type, significant, eom, ret, mkt.
# - sig_type:    which significance definition ("type" value) to keep.
# - cutoff_2012: if TRUE, mark end-2012 with a vertical line and include the
#                post-2012 column in the caption table.
# - first_date:  first month to include.
# - leg_pos:     legend position passed to theme().
# Returns the ggplot object.
plot_sig_oos <- function(sig_oos_pfs, sig_type, cutoff_2012, first_date, leg_pos) {
  # Full-sample performance statistics per region/type/significance group
  full <- sig_oos_pfs %>%
    filter(eom >= first_date) %>% 
    group_by(region, type, significant) %>%
    mutate(
      # Market-model alpha using the group's full-sample beta
      a = ret - cov(mkt,ret)/var(mkt)*mkt
    )  %>%
    summarise(
      n = n(),
      meanret = mean(ret),
      sd = sd(ret),
      sr = meanret/sd * sqrt(12),
      alpha = mean(a),
      resvol = sd(a),
      ir = alpha/resvol*sqrt(12),
      t_alpha = alpha/(resvol/sqrt(n))
    ) %>%
    mutate(alpha = alpha*12) %>%
    filter(region %in% c("us", "world_ex_us") & type == sig_type) %>%
    setDT()
  
  # Same statistics, but restricted to 2013 onwards; note the beta is still
  # estimated on the full post-first_date sample (mutate before the filter)
  post_harvey <- sig_oos_pfs %>%
    filter(eom >= first_date) %>%
    group_by(region, type, significant) %>%
    mutate(
      a = ret - cov(mkt,ret)/var(mkt)*mkt 
    )  %>%
    filter(eom >= as.Date("2013-01-01")) %>%
    summarise(
      n = n(),
      meanret = mean(ret),
      sd = sd(ret),
      sr = meanret/sd * sqrt(12),
      alpha = mean(a),
      resvol = sd(a),
      ir = alpha/resvol*sqrt(12),
      t_alpha = alpha/(resvol/sqrt(n))
    ) %>%
    mutate(alpha = alpha*12) %>%
    filter(region %in% c("us", "world_ex_us") & type == sig_type) %>%
    setDT()
  
  # Cumulative alphas (summed, after scaling to 10% annualized vol)
  cumret <- sig_oos_pfs %>%
    filter(eom >= first_date) %>%
    group_by(region, type, significant) %>%
    arrange(region, type, significant, eom) %>%
    filter(type == sig_type & region %in% c("us", "world_ex_us")) %>%
    mutate(
      alpha = ret - cov(mkt,ret)/var(mkt)*mkt,
      alpha = alpha / (sd(alpha)*sqrt(12)/0.1),
      cum_alpha = cumsum(alpha),
      region_pretty = case_when(
        region == "us" ~ "U.S.",
        region == "world_ex_us" ~ "World ex. U.S."
      )
    )
  
  # Figure 
  sig_oos <- cumret %>%
      ggplot(aes(eom, cum_alpha, colour = region_pretty, linetype = region_pretty)) +
      geom_line() +
      labs(y = "Cumulative Alpha") +
      theme(
        axis.title.x = element_blank(),
        legend.position = leg_pos,
        legend.title = element_blank()
      )
  
  # Table for caption: IR with t-stat in parentheses underneath, per sample
  tbl <- rbind(
    full %>% select(region, ir, t_alpha) %>% mutate(sample = "Full sample"),
    post_harvey %>% select(region, ir, t_alpha) %>% mutate(sample = "Post Harvey et al")
  ) %>%
    pivot_longer(c(ir, t_alpha)) %>%
    mutate(
      value = formatC(value, digits=2, format = "f"),
      value = if_else(name == "t_alpha", paste0("(", value, ")"), value)
    ) %>%
    pivot_wider(names_from = sample, values_from = value) %>%
    mutate(
      region = case_when(
        region=="us" ~ "IR: US",
        region == "world_ex_us" ~ "IR: World ex. US"
      ),
      region = if_else(name == "t_alpha", "", region)
    ) %>%
    select(-name) %>%
    rename("Region"=region)
  
  if (cutoff_2012) {
    sig_oos <- sig_oos + geom_vline(xintercept = as.Date("2012-12-31"), linetype = "dotted", alpha = 1)
    tbl %>% xtable(align = "llcc") %>% print(include.rownames = F)
  } else {
    tbl %>% select(-`Post Harvey et al`) %>% xtable(align = "llc") %>% print(include.rownames = F)
  }
  # Output 
  return(sig_oos)
}


# EB Posterior checks
# Diagnostic plots for an Empirical Bayes estimation object. `eb` is expected
# to carry $mle (columns estimate, ml_est), $factors (post_mean, post_sd,
# p025, p975, ols_est, ols_se, region, hcl_label, characteristic, char_reg),
# $clusters and $signal tables. `plot` selects one of the named diagnostics
# below; the chosen plot is print()-ed (the function's return value is the
# result of print()).
eb_plots <- function(eb, plot = "shrinkage") {
  # Implied population distribution of cluster alphas ~ N(a0, tau_bar)
  if (plot == "cluster_distribution") {
    a <- eb$mle %>% filter(estimate == "alpha") %>% pull(ml_est)
    tb <- eb$mle %>% filter(estimate == "tau_bar") %>% pull(ml_est)
    
    op <- data.frame(x=c(a-3*tb, a+3*tb)) %>% # data.frame(x=c(-1.2, 1.2)) %>%
      ggplot(aes(x)) + 
      stat_function(fun=function(x) dnorm(x = x, mean = a, sd = tb)) +
      labs(x = "Alpha", y = "Density", title = "Population Cluster Distribution ~ N(a0, tau_bar)")
  }
  # Implied population distribution of factor alphas (cluster + within-cluster
  # dispersion combined)
  if (plot == "factor_distribution") {
    a <- eb$mle %>% filter(estimate == "alpha") %>% pull(ml_est)
    tb <- eb$mle %>% filter(estimate == "tau_bar") %>% pull(ml_est)
    tt <- eb$mle %>% filter(estimate == "tau_tilde") %>% pull(ml_est)
    factor_sd <- sqrt(tb^2 + tt^2)
    
    op <- data.frame(x=c(a-3*factor_sd, a+3*factor_sd)) %>% # data.frame(x=c(-1.2, 1.2)) %>%
      ggplot(aes(x)) + 
      stat_function(fun=function(x) dnorm(x = x, mean = a, sd = factor_sd)) +
      labs(x = "Alpha", y = "Density", title = "Population Factor Distribution ~ N(a0, tau_bar + tau_tilde)")
  }
  # Posterior mean and 95% interval per factor, faceted by region
  if (plot == "factor") {
    op <- eb$factors %>%
      ggplot(aes(reorder(characteristic, post_mean), post_mean)) +
      geom_point() +
      geom_errorbar(aes(ymin = p025, ymax = p975)) +
      facet_wrap(~region) +
      theme(
        axis.title.x = element_blank(),
        axis.text.x = element_text(size = 5, angle = 90, vjust = 0, hjust = 1),
        text = element_text(size = 10)
      )
  }
  # Posterior mean and 95% interval per cluster
  if (plot == "cluster") {
    op <- eb$clusters %>% 
      ggplot(aes(reorder(hcl_label, post_mean), post_mean)) +
      geom_point() +
      geom_errorbar(aes(ymin = post_mean - 1.96 * post_sd, ymax = post_mean + 1.96 * post_sd)) +
      geom_hline(yintercept = 0, linetype = "dotted") +
      labs(y = "Posterior Distribution of Cluster Alpha") +
      theme(
        axis.title.x = element_blank(),
        axis.text.x = element_text(size = 8)
      ) 
  }
  # Posterior mean and 95% interval per signal
  if (plot == "signal") {
    op <- eb$signal %>%
      ggplot(aes(characteristic, post_mean)) +
      geom_point() +
      geom_errorbar(aes(ymin = post_mean - 1.96 * post_sd, ymax = post_mean + 1.96 * post_sd))
  }
  # EB posterior means vs raw OLS estimates, per factor and region
  if (plot == "factor_shrinkage") {
    op <- eb$factors %>%
      select(characteristic, region, "eb" = post_mean, "ols" = ols_est) %>%
      gather(eb, ols, key = "type", value = "alpha") %>%
      group_by(characteristic) %>%
      mutate(sort_var = alpha[region == "us" & type == "ols"]) %>%
      ggplot(aes(reorder(characteristic, sort_var), alpha, colour = region)) +
      geom_point() +
      facet_wrap(~type, ncol = 1) +
      theme(
        axis.title.x = element_blank(),
        axis.text.x = element_text(size = 5, angle = 90, vjust = 0, hjust = 1)
      )
  }
  # Relative reduction in standard errors from EB vs OLS, by region
  if (plot == "se") {
    # NOTE(review): this branch reads the global eb_act rather than the eb
    # argument — likely a leftover from interactive work; confirm whether
    # eb$factors was intended.
    op <- eb_act$factors %>% 
      mutate(ols_p025 = ols_est - 1.96 * ols_se) %>% 
      mutate(se_diff = (ols_se-post_sd) / ols_se) %>% 
      ggplot(aes(x = region, y = se_diff*100, colour = region)) +
      geom_boxplot() +
      expand_limits(y = 0) +
      theme(
        axis.title.x = element_blank(),
        legend.position = "none"
      ) +
      labs(y = "(SE_ols - SE_eb) / SE_ols * 100")
  }
  # Replication rates (EB vs OLS) printed, then per-cluster EB rates plotted
  if (plot == "repl") {
    repl_table <- eb$factors %>%
      group_by(region) %>%
      summarise(
        repl_eb = mean(p025 > 0),
        repl_ols = mean(ols_est - 1.96*ols_se > 0)
      )
    print(repl_table)
    
    op <- eb$factors %>%
      group_by(region, hcl_label) %>%
      summarise(
        mean_alpha = mean(post_mean),
        rep_rate = mean(p025>0)
      ) %>%
      group_by(hcl_label) %>%
      # NOTE(review): region values appear lowercase ("us") elsewhere in this
      # file; 'region == "US"' may match nothing here — confirm the casing.
      mutate(sort_var = rep_rate[region == "US"]) %>%
      ggplot(aes(reorder(hcl_label, sort_var), rep_rate)) +
      geom_col() +
      facet_wrap(~region, ncol = 1) +
      theme(
        axis.title.x = element_blank(),
        axis.text.x = element_text(size = 7)
      )
  }
  # Per-factor alphas with 95% bars, coloured by significance, EB vs OLS
  if (plot == "significance") {
    op <- bind_rows(
      eb$factors %>% select(char_reg, characteristic, hcl_label, region, "alpha" = post_mean, "se" = post_sd) %>% mutate(type = "hlm"),
      eb$factors %>% select(char_reg, characteristic, hcl_label, region, "alpha" = ols_est, "se" = ols_se) %>% mutate(type = "ols")
    ) %>%
      mutate(significant = alpha - 1.96 * se > 0) %>%
      group_by(hcl_label) %>%
      mutate(
        # NOTE(review): 'region == "US"' vs the lowercase "us" used elsewhere
        # — if nothing matches, median(numeric(0)) yields NA; confirm.
        sort_var = median(alpha[type == "ols" & region == "US"]),
        type = case_when(
          type == "hlm" ~ "Empirical Bayes",
          type == "ols" ~ "OLS"
        )
      ) %>%
      ggplot(aes(reorder(characteristic, sort_var), alpha, colour = significant)) +
      geom_point() +
      geom_errorbar(aes(ymin = alpha - 1.96*se, ymax = alpha + 1.96*se)) +
      geom_hline(yintercept = 0, colour = "black") +
      facet_wrap(region~type, ncol = 2) +
      labs(y = "Alpha", colour = "Significant") +
      theme(
        axis.title.x = element_blank(),
        axis.text.x = element_text(size = 5, angle = 90, vjust = 0, hjust = 1)
      )
  }
  # Visualizes shrinkage: EB posterior means vs OLS estimates against the
  # OLS-alpha rank (each characteristic appears twice, hence rank / 2)
  if (plot == "shrinkage") {
    op <- eb$factors %>%
      select(region, characteristic, "hlm_alpha_mean" = post_mean, "ols_alpha" = ols_est) %>%
      gather(-characteristic, -region, key = "type", value = "alpha") %>%
      group_by(characteristic, region) %>%
      mutate(sort_var = sum(alpha * (type == "ols_alpha"))) %>%
      group_by(region) %>%
      mutate(
        rank = frank(sort_var, ties.method = "max") / 2,
        type = case_when(
          type == "hlm_alpha_mean" ~ "Empricial Bayes Posterior Mean",
          type == "ols_alpha" ~ "OLS Estimate"
        )
      ) %>%
      ggplot(aes(rank, alpha, shape = type, colour = type, group = type)) +
      geom_smooth(method = "lm", se = F, formula = "y ~ x") +
      geom_point() +
      facet_wrap(~region) +
      theme(
        axis.text.x = element_text(size = 7),
        text = element_text(size = 10)
      ) +
      labs(x = "Rank OLS Alpha", y = "Alpha", colour = "Type", shape = "Type")
  }
  
  # Side-by-side EB vs OLS alphas with 95% bars, coloured by cluster
  if (plot == "comparison") {
    op <- bind_rows(
      eb$factors %>% select(char_reg, characteristic, hcl_label, region, "alpha" = post_mean, "se" = post_sd) %>% mutate(type = "hlm"),
      eb$factors %>% select(char_reg, characteristic, hcl_label, region, "alpha" = ols_est, "se" = ols_se) %>% mutate(type = "ols")
    ) %>%
      group_by(hcl_label) %>%
      mutate(
        sort_var = median(alpha[type == "ols" & region == "US"]),
        type = case_when(
          type == "hlm" ~ "Empirical Bayes",
          type == "ols" ~ "OLS"
        )
      ) %>%
      ggplot(aes(reorder(characteristic, sort_var), alpha, colour = hcl_label)) +
      geom_point() +
      geom_errorbar(aes(ymin = alpha - 1.96*se, ymax = alpha + 1.96*se)) +
      geom_hline(yintercept = 0, colour = "black") +
      facet_wrap(region~type, ncol = 2) +
      labs(y = "Alpha", colour = "Cluster") +
      theme(
        axis.title.x = element_blank(),
        axis.text.x = element_text(size = 5, angle = 90, vjust = 0, hjust = 1)
      )
  }
  # Overlaid densities of draws from each cluster's posterior
  if (plot == "cluster_density") {
    op <- eb$clusters %>% 
      group_by(hcl_label) %>% 
      nest() %>% 
      mutate(randoms = data%>% map(~rnorm(1000, mean = .x$post_mean, sd = .x$post_sd))) %>% 
      unnest(randoms) %>% 
      ggplot(aes(x = randoms, fill = hcl_label)) + 
      geom_density(alpha = 0.5)  #  + facet_wrap(~hcl_label)
  }
  print(op)
}


================================================
FILE: Analysis/1 - Prepare Data.R
================================================
# Prepare Support Data ---------------------------------
# Market Returns
# Read eom as character so the date parse on the next line is explicit.
market_returns <- fread(paste0(data_path, "/market_returns.csv"), colClasses = c("eom"="character"))
market_returns[, eom := eom %>% as.Date(format = "%Y-%m-%d")]
market_returns <- market_returns[, .(excntry, eom, mkt_vw_exc, stocks, me_lag1)]
# Sample window, excluded countries, and two hand-identified data errors.
market_returns <- market_returns[
  eom >= settings$start_date & eom <= settings$end_date &
    !(excntry %in% settings$country_excl) &   
    !(excntry == "PER" & eom == as.Date("1992-01-31") & mkt_vw_exc >= 8900) &  # Huge outlier 
    !(excntry == "VEN" & eom == as.Date("2018-02-28") & mkt_vw_exc < -1)]      # Something is clearly wrong

# Labels
# Per-characteristic metadata: sign of the original signal (direction),
# original-study significance, in-sample period, and HXZ group.
char_info <- readxl::read_xlsx("Factor Details.xlsx",
                               sheet = "details", range = "A1:N300") %>%
  select("characteristic"=abr_jkp, direction, significance, date_range = `in-sample period`, "hxz_group"=group) %>%
  filter(!is.na(characteristic)) %>%
  mutate(
    direction = direction %>% as.integer,
    # date_range is a "YYYY-YYYY" string; split into start and end year.
    sample_start = date_range %>% str_extract("^\\d+") %>% as.integer(),
    sample_end = date_range %>% str_extract("\\d+$") %>% as.integer()
  )
base_chars <- char_info$characteristic

# Country Classification (MSCI development status per country code)
country_classification <- readxl::read_xlsx("Country Classification.xlsx", 
                                            sheet = "countries", range = "A1:C200") %>%
  select(excntry, msci_development, region) %>%
  filter(!is.na(excntry)) %>%
  setDT()

# Regions
# Each region = a name, the country codes it contains, and the minimum number
# of countries a month must have for the regional observation to count.
region_info <- tibble(
  name = c("us", "developed", "emerging", "frontier", "world", "world_ex_us"),
  country_codes = list(
    "USA",
    country_classification[msci_development == "developed" & excntry != "USA"]$excntry,
    country_classification[msci_development == "emerging"]$excntry,
    country_classification[msci_development == "frontier"]$excntry,
    country_classification$excntry,
    country_classification[excntry != "USA"]$excntry
  ),
  countries_min = c(1, rep(settings$countries_min, 3), 1, 3)
)

# Prepare Data --------------------------------------------------------
# HML ----------------------
# High-minus-low factor returns per characteristic x country x month.
hml <- fread(paste0(data_path, "/hml.csv"), colClasses = c("eom"="character"))
hml[, eom := eom %>% as.Date(format = "%Y-%m-%d")]
# Choose weighting
# The US and ex-US samples may use different weighting schemes (vw/ew/vw_cap).
hml[excntry == "USA", ret := case_when(
  settings$weighting$us == "vw" ~ ret_vw,
  settings$weighting$us == "ew" ~ ret_ew,
  settings$weighting$us == "vw_cap" ~ ret_vw_cap
)]
hml[excntry != "USA", ret := case_when(
  settings$weighting$global_ex_us == "vw" ~ ret_vw,
  settings$weighting$global_ex_us == "ew" ~ ret_ew,
  settings$weighting$global_ex_us == "vw_cap" ~ ret_vw_cap
)]
# Screens
hml <- hml %>%
  filter(
    characteristic %in% base_chars,
    eom >= settings$start_date & eom <= settings$end_date,
    !is.na(ret) & n_stocks_min >= settings$n_stocks_min,  # Min Stocks is the big one, it removes close to 50% of the obs. with n_stocks_min>=10. Perhaps it's too stringent.
    !(excntry %in% settings$country_excl)
  ) %>%
  select(-signal, -n_stocks_min)
# Set direction as original study
# Flip sign (direction is +1/-1) so a positive mean matches the published sign.
hml <- hml %>%
  left_join(char_info %>% select(characteristic, direction), by = "characteristic") %>%
  mutate(ret = ret * direction) %>%
  select(-ret_vw, -ret_ew, -ret_vw_cap)
# Ensure no Duplicates (one row per characteristic x country x month)
if(hml[, .N, by = .(characteristic, excntry, eom)][, max(N)] > 1) {
  warning("THE DATA HAS DUPLICATES")
} 

# Regional Portfolios ------------------------------------------------
# Aggregate country-level factor returns into regional portfolios.
#
# Args:
#   data:          data.table with characteristic, excntry, eom, ret
#                  (plus size_grp when size_grps = TRUE).
#   countries:     character vector of country codes defining the region.
#   weighting:     country weighting scheme: "market_cap", "stocks", or "ew".
#   countries_min: minimum number of countries required in a month.
#   months_min:    minimum number of months required per characteristic.
#   size_grps:     aggregate separately within size groups? (default FALSE)
# Returns: data.table of weighted regional returns meeting both minimums.
regional_data <- function(data, countries, weighting, countries_min, months_min, size_grps = FALSE) {
  # Determine Country Weights (taken from the market return file)
  weights <- market_returns[, .(excntry, eom, mkt_vw_exc, "country_weight" = case_when(
    weighting == "market_cap" ~ me_lag1,
    weighting == "stocks" ~ as.double(stocks),
    weighting == "ew" ~ 1)
  )]
  # Portfolio Return: restrict to the region's countries, attach weights
  pf <- data[excntry %in% countries]
  pf <- weights[pf, on = .(excntry, eom)]
  # The original duplicated the aggregation for the size-group case; the two
  # branches only differed in the grouping columns, so build `by` dynamically.
  by_cols <- c("characteristic", if (size_grps) "size_grp", "eom")
  pf <- pf[, .(
    n = .N,  # number of countries contributing this month
    ret = sum(ret * country_weight) / sum(country_weight),
    mkt_vw_exc = sum(mkt_vw_exc * country_weight) / sum(country_weight)
  ), by = by_cols]
  # Minimum Requirement: Countries
  pf <- pf[n >= countries_min]
  # Minimum Requirement: Months
  pf[, months := .N, by = .(characteristic)]
  pf[months >= months_min]
}
# Build the regional HML portfolios for every region defined in region_info.
# seq_len() instead of 1:nrow() so a zero-row region_info yields no iterations.
regional_pfs <- seq_len(nrow(region_info)) %>% lapply(function(i) {
  info <- region_info[i, ]
  reg_pf <- hml %>% regional_data(countries = unlist(info$country_codes), weighting = settings$country_weighting,
                                  countries_min = info$countries_min, months_min = settings$months_min)
  reg_pf %>% mutate(region = info$name)
}) %>% bind_rows() 

# Characteristic Managed Portfolios ----------------------
# CMPs: signal-weighted portfolios within country x size group.
cmp <- fread(paste0(data_path, "/cmp.csv"), colClasses = c("eom"="character"))
cmp[, eom := eom %>% as.Date(format="%Y-%m-%d")]

# Screens (note the stricter stock minimum, 2x the HML screen)
cmp <- cmp %>%
  rename(ret = ret_weighted) %>%
  filter(
    characteristic %in% base_chars,
    eom >= settings$start_date & eom <= settings$end_date,
    !is.na(ret) & signal_weighted != 0 & n_stocks >= settings$n_stocks_min * 2, 
    !(excntry %in% settings$country_excl)
  ) %>%
  select(-signal_weighted, -n_stocks)
# Determine Direction (flip to the sign used in the original study)
cmp <- cmp %>%
  left_join(char_info %>% select(characteristic, direction), by = "characteristic") %>%
  mutate(ret = ret * direction)
# Ensure no Duplicates (one row per characteristic x country x size group x month)
if(cmp[, .N, by = .(characteristic, excntry, size_grp, eom)][, max(N)] > 1) {
  warning("THE DATA HAS DUPLICATES")
} 
# Regional Portfolios
# FIX: the loop body previously indexed region_info[i, ] while iterating over
# region_info_cmp; that only worked because "us" happens to be row 1 of both
# tables. Index the filtered table actually being iterated over.
region_info_cmp <- region_info %>% filter(name == "us")
regional_pfs_cmp <- seq_len(nrow(region_info_cmp)) %>% lapply(function(i) {
  info <- region_info_cmp[i, ]
  reg_pf <- cmp %>% regional_data(countries = unlist(info$country_codes), weighting = settings$country_weighting,
                                  countries_min = info$countries_min, months_min = settings$months_min, size_grps = TRUE)
  reg_pf %>% mutate(region = info$name)
}) %>% bind_rows() 

# Regional Market Returns ---
# Value-weighted (by lagged market equity) market excess return per region/month,
# subject to the region's minimum-country requirement.
regional_mkt_ret <- seq_len(nrow(region_info)) %>% lapply(function(i) {
  info <- region_info[i, ]
  mkt <- market_returns[excntry %in% unlist(info$country_codes), .(n = .N, market = sum(mkt_vw_exc * me_lag1) / sum(me_lag1)), by = eom]
  mkt <- mkt[n >= info$countries_min][, n := NULL]
  mkt %>% mutate(region = info$name)
}) %>% bind_rows() 

print(paste("Total Characteristics:", uniqueN(regional_pfs$characteristic)))


================================================
FILE: Analysis/2 - Determine Clusters.R
================================================
# Hierarchical Clustering ----------------------------------------------
# Cluster factors by hierarchical clustering on a correlation-based distance.
#   cor_mat:        factor-by-factor correlation matrix.
#   linkage:        agglomeration method passed to hclust().
#   k:              number of clusters the tree is cut into.
#   direction_bars: add a bar under the dendrogram marking each factor's direction.
# Side effect: plots the dendrogram (horizontally).
# Returns: list(cor, labels, dend) plus bar_colours when direction_bars = TRUE.
factor_hcl <- function(cor_mat, linkage = "ward.D", k, direction_bars = T) {
  # Distance = 1 - correlation (perfectly correlated factors have distance 0)
  dist_mat <- as.dist((1-cor_mat))
  # dist_mat <- as.dist(sqrt((1-cor_mat)*2)) # With ward.D2 gives the same clusters
  hcl <- dist_mat %>% 
    hclust(method=linkage)
  # How faithfully the tree preserves the original pairwise distances
  print(str_c("Cophenetic Correlation between Dendogram and Distance Matrix = ", 
              format(cor(cophenetic(hcl), dist_mat), digits = 2, nsmall = 2)))
  
  # Cut the tree into k clusters: one (characteristic, hcl) row per factor
  hcl_labels <- hcl %>%
    cutree(k = k) %>%
    as_tibble(rownames = "characteristic") %>%
    setDT() %>% 
    setnames(c("characteristic", "hcl"))

  # Recycle 8 theme colours across the k clusters
  hcl_col <- rep(colours_theme[c(1, 2, 3, 4, 5, 7, 9, 11)], ceiling(k/8))[1:k]
  
  dend <- hcl %>% 
    as.dendrogram() %>%
    dendextend::set("labels_col", value = hcl_col, k=k) %>%
    dendextend::set("branches_k_color", value = hcl_col, k=k) %>%
    dendextend::set("labels_cex", value = 0.5) %>%
    dendextend::set("branches_lty", 1) %>%
    dendextend::set("branches_lwd", 0.2) 
  dend %>% plot(horiz=T)
  
  return_list <- list(
    "cor" = cor_mat,
    "labels" = hcl_labels,
    "dend" = dend
  )
  
  if (direction_bars) {
    # Black bar = factor goes long the high end of the signal; white = the low end
    bar_colours <- tibble("characteristic" = colnames(cor_mat)) %>%
      left_join(char_info %>% select(characteristic, direction), by = "characteristic") %>% 
      mutate(col_dir = if_else(direction == 1, "black", "white"))
    colored_bars(colors = bar_colours %>% select(col_dir), dend = dend, rowLabels = c("Long High"), y_shift = 3, horiz = T)
    return_list$bar_colours <- bar_colours
  }
  return(return_list)
}

# Build the correlation matrix used as clustering input.
#   ret_type "raw":   correlate raw returns.
#   ret_type "alpha": correlate market-adjusted returns (per-characteristic
#                     beta = cov(ret, mkt)/var(mkt), estimated over the full sample).
#   ...: forwarded to cor() (e.g. method, use).
hcl_input <- function(data, ret_type = "alpha", ...) { # ret_type %in% c("raw", "alpha")
  data <- copy(data)  # Avoid modifying in place
  if (ret_type == "raw") {
    data[, ret_hcl := ret]
  }
  if (ret_type == "alpha") {
    data[, ret_hcl := ret - mkt_vw_exc * cov(ret, mkt_vw_exc)/var(mkt_vw_exc), by = characteristic]
  }
  # Reshape to months x characteristics, then pairwise correlations
  data %>%
    select(characteristic, eom, ret_hcl) %>%
    spread(key = characteristic, value = ret_hcl) %>%
    select(-eom) %>%
    cor(...)  
}

# US Clusters -----------
# Run the clustering on the region/sample configured in settings$hcl.
clusters <- regional_pfs %>%
  filter(region == settings$hcl$region & year(eom) >= settings$hcl$start_year) %>%
  hcl_input(ret_type = settings$hcl$ret_type, method = settings$hcl$cor_method, use = "pairwise.complete.obs") %>%
  factor_hcl(linkage = settings$hcl$linkage, k = settings$hcl$k, direction_bars = T)

# Cluster Labels
# The hand-curated names below are only valid for the baseline specification
# (US, vw_cap, k = 13, start 1975); otherwise fall back to numeric cluster ids.
if (settings$weighting$us == "vw_cap" & settings$hcl$k == 13 & settings$hcl$region == "us" & settings$hcl$start_year == 1975) {
  clusters$labels <- clusters$labels %>% 
    mutate(
      hcl_label = case_when(
        hcl == 1 ~ "Low Leverage",  
        hcl == 2 ~ "Investment",     
        hcl == 3 ~ "Size", 
        hcl == 4 ~ "Value",  
        hcl == 5 ~ "Quality", 
        hcl == 6 ~ "Low Risk", 
        hcl == 7 ~ "Debt Issuance", 
        hcl == 8 ~ "Seasonality", 
        hcl == 9 ~ "Accruals",   
        hcl == 10 ~ "Profitability",
        hcl == 11 ~ "Profit Growth",
        hcl == 12 ~ "Short-Term Reversal",
        hcl == 13 ~ "Momentum",
        TRUE ~ as.character(hcl))
    ) 
} else {
  clusters$labels <- clusters$labels %>% 
    mutate(hcl_label = hcl)
}
# Output: characteristic -> cluster label mapping used throughout the analysis
cluster_labels <- clusters$labels %>%
  select(-hcl)


================================================
FILE: Analysis/3 - Analysis.R
================================================
# Empirical Bayes Estimation --------------------------
# search_list: c(regions, type, layers, size_grp)
#   [[1]] region(s) to pool, [[2]] data type ("hml" or "cmp"),
#   [[3]] number of hierarchy layers, [[4]] size group (cmp only).
search_list <- list(
  "us" = list("us", "hml", 2), 
  "developed" = list("developed", "hml", 2),
  "emerging" = list("emerging", "hml", 2),
  "all" = list(c("us", "developed", "emerging"), "hml", 3),
  "world" = list("world", "hml", 2),
  "world_ex_us" = list("world_ex_us", "hml", 2),
  "us_mega" = list("us", "cmp", 2, "mega"),
  "us_large" = list("us", "cmp", 2, "large"),
  "us_small" = list("us", "cmp", 2, "small"),
  "us_micro" = list("us", "cmp", 2, "micro"),
  "us_nano" = list("us", "cmp", 2, "nano")
)
# One EB estimation per specification; names are preserved via USE.NAMES.
eb_est <- search_list %>% sapply(simplify = F, USE.NAMES = T, function(x) {
  print(paste("Region:", x[[1]]))
  regions <- x[[1]]
  if (x[[2]] == "cmp") {
    base_data <- copy(regional_pfs_cmp) %>% filter(size_grp == x[[4]])
  }
  if (x[[2]] == "hml") {
    base_data <- copy(regional_pfs)
  }
  # Prepare Data
  # NOTE(review): eb_prepare's parameter is `scale_alphas`; `scale_alpha`
  # relies on R's partial argument matching — consider spelling it out.
  data <- base_data %>% 
    filter(eom >= settings$start_date & eom <= settings$end_date) %>%
    filter(region %in% regions) %>% 
    eb_prepare(
      scale_alpha = settings$eb$scale_alpha, 
      overlapping = settings$eb$overlapping 
    )
  # Run Empirical Bayes
  op <- data %>% 
    emp_bayes( 
      cluster_labels = cluster_labels, 
      min_obs = settings$eb$min_obs,
      fix_alpha = settings$eb$fix_alpha, 
      bs_cov = settings$eb$bs_cov, 
      layers = x[[3]], 
      shrinkage = settings$eb$shrinkage, 
      cor_type = settings$eb$cor_type, 
      bs_samples = settings$eb$bs_samples, 
      seed = settings$seed
    )
  # Output
  return(op)
})

# Simulations EB vs. BY --------------
# Simulations
if (update_sim) {
  # Values from Data: pairwise factor correlations of market-neutral returns
  pairwise_cor <- eb_est$us$input$long %>%
    select(characteristic, eom, ret_neu) %>%
    spread(key = characteristic, value = ret_neu) %>%
    select(-eom) %>%
    cor(use = "pairwise.complete.obs") 
  
  # Average correlation within vs. across clusters (used to calibrate the sim)
  cor_value <- pairwise_cor %>%
    as_tibble(rownames = "char1") %>%
    gather(-char1, key = "char2", value = "cor") %>%
    left_join(cluster_labels %>% select(characteristic, "hcl1" = hcl_label), by = c("char1"="characteristic")) %>%
    left_join(cluster_labels %>% select(characteristic, "hcl2" = hcl_label), by = c("char2"="characteristic")) %>%
    filter(char1 != char2) %>%
    mutate(same_cluster = (hcl1 == hcl2)) %>%
    group_by(same_cluster) %>%
    summarise(avg_cor = mean(cor))
  
  # Time periods: median number of monthly observations per factor
  med_months <- eb_est$us$input$long %>% group_by(characteristic) %>% summarise(n = n()) %>% pull(n) %>% median()
  
  data <- list(
    yrs = round(med_months / 12),
    cor_within = cor_value %>% filter(same_cluster == T) %>% pull(avg_cor) %>% round(digits = 2),
    cor_across = cor_value %>% filter(same_cluster == F) %>% pull(avg_cor) %>% round(digits = 2)
  )
  # Simulation Settings (hard-coded; checked against the data moments below)
  set.seed(settings$seed)
  sim <- list(
    "alpha_0" = 0,
    "t" = 12*70,      # Median amount of data
    "clusters" = 13,
    "fct_pr_cl" = 10,
    "corr_within" = 0.58, 
    "corr_across" = 0.02,
    "n_sims" = 10000,
    "tau_c" = c(0.01, seq(from = 0.05, to = 0.5, by = 0.05)),
    "tau_w" = c(0.01, 0.2)
  )
  sim$se <- (10/sqrt(12))/sqrt(sim$t)
  sim$n <- sim$clusters * sim$fct_pr_cl
  
  # Check settings are consistent with data [Alert if there is a significant difference]
  if (abs(sim$t - data$yrs*12) > 12 | abs(sim$corr_within - data$cor_within) > 0.05 | abs(sim$corr_across - data$cor_across) > 0.05) {
    warning("SIMULATION AND DATA VALUES ARE NOT CONSISTENT!")
    print(data)
    print(list("yrs"=sim$t/12, "corr_within"=sim$corr_within, "corr_across"=sim$corr_across))
  }
  simulation <- sim_mt_control(sim_settings = sim)
  simulation %>% saveRDS(file = paste0(object_path, "/fdr_sim.RDS"))
} else {
  # Reuse the cached simulation when update_sim is FALSE
  simulation <- readRDS(file = paste0(object_path, "/fdr_sim.RDS"))
}
# False Discovery Rate implied by the estimated model
model_fdr <- fdr_sim(t_low = 0, a_vec = eb_est$us$factor_mean, a_cov = eb_est$us$factor_cov, n_sim = 10000, seed = settings$seed)
# Multiple Testing Adjustments
mt <- multiple_testing(eb_all = eb_est$all, eb_world = eb_est$world)
# Tangency Portfolios -----------------------------
# Regions
tpf_world <- eb_est$world$input$long %>% tpf_cluster(mkt_region = "world", orig_sig = T, min_date = settings$tpf$start$world, n_boots = settings$tpf$bs_samples, shorting = settings$tpf$shorting, seed = settings$seed)
tpf_us <- eb_est$us$input$long %>% tpf_cluster(mkt_region = "us", orig_sig = T, min_date = settings$tpf$start$us, n_boots = settings$tpf$bs_samples, shorting = settings$tpf$shorting, seed = settings$seed)
tpf_dev <- eb_est$developed$input$long %>% tpf_cluster(mkt_region = "developed", orig_sig = T, min_date = settings$tpf$start$developed, n_boots = settings$tpf$bs_samples, shorting = settings$tpf$shorting, seed = settings$seed)
tpf_emer <- eb_est$emerging$input$long %>% tpf_cluster(mkt_region = "emerging", orig_sig = T, min_date = settings$tpf$start$emerging, n_boots = settings$tpf$bs_samples, shorting = settings$tpf$shorting, seed = settings$seed)
# Size Groups (one tangency portfolio per US size group)
tpf_size <- c("mega", "large", "small", "micro", "nano") %>% lapply(function(x) {
  eb_est[[paste0("us_", x)]]$input$long %>% tpf_cluster(mkt_region = "us", orig_sig = T, min_date = settings$tpf$start$size_grps, n_boots = settings$tpf$bs_samples, shorting = settings$tpf$shorting, seed = settings$seed) %>%
    mutate(size_grp = x)
}) %>%
  bind_rows()
# Single Factor TPF
tpf_factors <- prepare_tpf_factors(region = settings$tpf_factors$region, orig_sig_values = settings$tpf_factors$orig_sig, 
                                   start = settings$tpf_factors$start, scale = settings$tpf_factors$scale)
opt_s <- tpf_factors$long %>% optimal_shrinkage(k = settings$tpf_factors$k)

# Posterior Over time -----------------
# Re-estimate the EB posterior on expanding samples (one estimate per year-end),
# once with hyper-parameters fixed at their full-sample values and once
# re-estimated each period ("flex").
ot_region <- "world"
if (update_post_over_time) {
  for (fix_taus in c(T,F)) {
    if (fix_taus) {
      # Hold alpha, tau_c, tau_s at the full-sample MLE values
      fixed_priors <- list(
        "alpha" = eb_est[[ot_region]]$mle %>% filter(estimate == "alpha") %>% pull(ml_est), 
        "tau_c" = eb_est[[ot_region]]$mle %>% filter(estimate == "tau_c") %>% pull(ml_est), 
        "tau_s" = eb_est[[ot_region]]$mle %>% filter(estimate == "tau_s") %>% pull(ml_est)
      )
    } else {
      fixed_priors <- NULL
    }
    
    periods <- sort(unique(regional_pfs$eom))
    periods <- periods[month(periods) == 12]  # Only estimate once per year
    
    # Keep only characteristics with enough data by end-1960, so the factor set
    # is constant across all expanding-window estimates
    time_chars <- regional_pfs %>% 
      filter(region == ot_region & eom <= as.Date("1960-12-31")) %>% 
      group_by(characteristic) %>%
      filter(n() >= settings$eb$min_obs) %>%
      pull(characteristic) %>% 
      unique()
    
    posterior_over_time <- periods[periods >= as.Date("1960-12-31")] %>% lapply(function(end_date) { 
      print(end_date)
      # Prepare Data (expanding window up to end_date)
      data <- regional_pfs %>% 
        filter(characteristic %in% time_chars) %>%
        filter(eom >= settings$start_date & eom <= end_date) %>%
        filter(region == ot_region) %>% 
        eb_prepare(
          scale_alpha = settings$eb$scale_alpha, 
          overlapping = settings$eb$overlapping 
        )
      # Run Empirical Bayes
      eb_act <- data %>% 
        emp_bayes( 
          cluster_labels = cluster_labels, 
          min_obs = settings$eb$min_obs,
          fix_alpha = settings$eb$fix_alpha, 
          bs_cov = settings$eb$bs_cov, 
          layers = 2, 
          shrinkage = settings$eb$shrinkage, 
          cor_type = settings$eb$cor_type, 
          bs_samples = 1000, 
          priors = fixed_priors,
          seed = settings$seed
        )
      # Drop the input data to keep the saved object small
      eb_act$input <- NULL
      eb_act$end_date <- end_date
      return(eb_act)
    }) 
    if (fix_taus) {
      posterior_over_time %>% saveRDS(file = paste0(object_path, "/posterior_over_time.RDS"))
    } else {
      posterior_over_time %>% saveRDS(file = paste0(object_path, "/posterior_over_time_flex.RDS"))
    }
  }
} 
# Always load from cache (written above when update_post_over_time is TRUE)
posterior_over_time <- readRDS(file = paste0(object_path, "/posterior_over_time.RDS"))
posterior_over_time_flex <- readRDS(file = paste0(object_path, "/posterior_over_time_flex.RDS"))

# Size Dimension: stack the per-size-group factor posteriors into one table and
# turn size_grp into a display factor ordered from Mega down to Nano.
eb_us_size <- c("mega", "large", "small", "micro", "nano") %>%
  map(function(grp) eb_est[[str_c("us_", grp)]]$factors %>% mutate(size_grp = grp)) %>%
  bind_rows() %>%
  mutate(
    size_grp = factor(str_to_title(size_grp),
                      levels = c("Mega", "Large", "Small", "Micro", "Nano"))
  )

# In-Sample / Out-of-Sample ------------------------------------
# For each split type (pre / post / pre_post), estimate each factor's CAPM
# alpha per period and spread the alphas wide for period-vs-period comparison.
is_oos <- c("pre", "post", "pre_post") %>% sapply(simplify = F, USE.NAMES = T, function(t) {
  data <- eb_est$us$input$long %>% prepare_is_oos(min_obs = 60, ret_scaled = "all", orig_group = T, type = t, print=T)
  regs <- data %>%
    group_by(characteristic, period, n_is, n_oos) %>%
    nest() %>%
    mutate(
      # CAPM regression per characteristic x period; keep only the intercept (alpha)
      fit = data %>% map(~lm(ret_adj ~ mkt_vw_exc, data = .x)),
      tidied = fit %>% map(tidy)
    ) %>% 
    unnest(tidied) %>%
    filter(term == "(Intercept)") %>%
    select(characteristic, period, n_is, n_oos, estimate) %>%
    spread(key = period, value = estimate)
  list(data=data, regs=regs)
})

# Economic Benefit of More Powerful Multiple Comparisons
# Re-estimate the US EB posterior on expanding samples (annually from 1959) so
# we can later trade on end-of-year significance signals.
if (update_post_is) {
  periods <- sort(unique(regional_pfs$eom))
  periods <- periods[month(periods) == 12 & year(periods) >= 1959]  # Only estimate once per year
  
  posterior_is <- periods %>% lapply(function(end_date) { 
    print(paste("Date", end_date, "-" , match(end_date, periods), "out of", length(periods)))
    # Prepare Data (expanding window up to end_date)
    data <- regional_pfs %>% 
      filter(eom >= settings$start_date & eom <= end_date) %>%
      filter(region == "us") %>% 
      eb_prepare(
        scale_alpha = settings$eb$scale_alpha, 
        overlapping = settings$eb$overlapping 
      )
    # Run Empirical Bayes
    eb_act <- data %>% 
      emp_bayes( 
        cluster_labels = cluster_labels, 
        min_obs = settings$eb$min_obs,
        fix_alpha = settings$eb$fix_alpha, 
        bs_cov = settings$eb$bs_cov, 
        layers = 2, 
        shrinkage = settings$eb$shrinkage, 
        cor_type = settings$eb$cor_type, 
        bs_samples = 1000,
        seed = settings$seed
      )
    # Output: factor-level posteriors stamped with the estimation date
    eb_act$factors %>% mutate(est_date = end_date)
  }) %>% bind_rows()
  posterior_is %>% saveRDS(file = paste0(object_path, "/posterior_is.RDS"))
} else {
  posterior_is <- readRDS(file = paste0(object_path, "/posterior_is.RDS"))
}
sig_oos_pfs <- posterior_is %>% trading_on_significance()

# Harvey et al (2016) Simulation - Baseline ------------------------------
# We use the baseline specification from table 5 - Panel A where the average correlation is 0 (the average correlation among factors in our data is 7%)
# 1300 * (1-0.396)  The harvey et al numbers are m=1297 and m_true=783
harvey_base <- list(
  alpha_0 = 0,
  t = 70*12,                # months of data per factor
  ret = 4.4 / 12,           # monthly mean return of true factors
  vol = 10 / sqrt(12),      # monthly volatility
  cl = 26,                  # total clusters
  cl_true = 16,             # clusters with true alpha
  fct_pr_cl = 50,           # factors per cluster
  corr_across = 0.02,
  corr_within = 0.58,
  tau_ws = c(0.21),         # We estimate it at 0.21
  n_sims = 50,
  fix_alpha = T
)
harvey_base$se <- harvey_base$vol / sqrt(harvey_base$t)
harvey_base$n <- harvey_base$cl * harvey_base$fct_pr_cl
harvey_base$n_true <- harvey_base$cl_true * harvey_base$fct_pr_cl

if (update_harvey_baseline) {
  harvey_base_res <- harvey_et_al_sim(sim_settings = harvey_base, seed = settings$seed)
  harvey_base_res <- list("settings" = harvey_base, "sim" = harvey_base_res)
  harvey_base_res %>% saveRDS(file = paste0(object_path, "/harvey_res_baseline.RDS"))
} else {
  harvey_base_res <- readRDS(file = paste0(object_path, "/harvey_res_baseline.RDS"))
}

# Harvey et al (2016) Simulation - Worst Case
# We use the worst cases specification from table 5 - Panel B where the average correlation is 0 (the average correlation among factors in our data is 7%)
# 2500 * (1-0.683) = 800  The harvey numbers are m=2458 and m_true=779
harvey_worst <- list(
  alpha_0 = 0,
  t = 70*12,                # Median number of years for US factor
  ret = 4.4 / 12,
  vol = 10 / sqrt(12),
  cl = 50,                  # only difference from the baseline: more clusters
  cl_true = 16,
  fct_pr_cl = 50,
  corr_across = 0.02,
  corr_within = 0.58,
  tau_ws = c(0.21),         # We estimate it at 0.21 Same as what we estimate
  n_sims = 50,
  fix_alpha = T
)
harvey_worst$se <- harvey_worst$vol / sqrt(harvey_worst$t)
harvey_worst$n <- harvey_worst$cl * harvey_worst$fct_pr_cl
harvey_worst$n_true <- harvey_worst$cl_true * harvey_worst$fct_pr_cl

if (update_harvey_worstcase) {
  harvey_worst_res <- harvey_et_al_sim(sim_settings = harvey_worst, seed = settings$seed)
  harvey_worst_res <- list("settings" = harvey_worst, "sim" = harvey_worst_res)
  harvey_worst_res %>% saveRDS(file = paste0(object_path, "/harvey_res_worstcase.RDS"))
} else {
  harvey_worst_res <- readRDS(file = paste0(object_path, "/harvey_res_worstcase.RDS"))
}

# Estimate parameters on OOS data -----------
# Exploratory block — disabled by default (wrapped in if (FALSE)); run manually.
if (FALSE) {
  reg <- "us"
  # Prepare Data: keep only observations OUTSIDE each factor's original
  # in-sample period, so the hyper-parameters are estimated on OOS data
  data <- regional_pfs %>% 
    filter(eom >= settings$start_date & eom <= settings$end_date) %>%
    filter(region==reg) %>%
    left_join(char_info %>% select(characteristic, sample_start, sample_end), by = "characteristic") %>%
    filter(year(eom) < sample_start | year(eom) > sample_end) %>%
    eb_prepare(
      scale_alpha = settings$eb$scale_alpha, 
      overlapping = settings$eb$overlapping # If we start in 1955-02-28 we lose 12 factors relative to starting in 1972-11-30
    )
  # Run Empirical Bayes
  op <- data %>% 
    emp_bayes( 
      cluster_labels = cluster_labels, 
      min_obs = settings$eb$min_obs,
      fix_alpha = settings$eb$fix_alpha, 
      bs_cov = settings$eb$bs_cov, 
      layers = 2, 
      shrinkage = settings$eb$shrinkage, 
      cor_type = settings$eb$cor_type, 
      bs_samples = settings$eb$bs_samples,
      seed = settings$seed
    )
  # OOS-replication rate (printed interactively, not assigned)
  op$factors %>%
    left_join(char_info %>% select(characteristic, significance), by = "characteristic") %>%
    filter(significance == 1) %>%
    summarise(
      n = n(),
      sd_ols = sd(ols_est),
      eb_rr = mean(p025 > 0),
      eb_ols = mean(ols_est - 1.96*ols_se > 0)
    )
  # OOS Hyperparameters (rename tau_s to tau_w for display)
  op$mle %>% mutate(estimate = if_else(estimate == "tau_s", "tau_w", estimate))
  # Full sample Tau's
  eb_est[[reg]]$mle %>% mutate(estimate = if_else(estimate == "tau_s", "tau_w", estimate))
  # Replication rate with OOS hyperparameters
  # Compute the EB posterior implied by given hyper-parameters and report the
  # share of originally significant factors whose 95% posterior CI is above 0.
  #   chars/alphas: factor names and their OLS alpha estimates.
  #   sigma:        sampling covariance of the alpha estimates.
  #   alpha0:       common prior mean; tau_c / tau_w: cluster / within-cluster sd.
  repl_rate <- function(chars, alphas, sigma, alpha0, tau_c, tau_w, cluster_labels, char_info) {
    # Alpha zero vector (common prior mean, repeated per factor)
    alpha0_vec <- rep(alpha0, length(alphas))
    # Signal Membership: indicator matrix of factor x cluster
    cm <- tibble(characteristic = chars, "alpha"= alphas) %>% 
      left_join(cluster_labels, by = "characteristic") 
    m <- cm %>%
      mutate(cm = 1) %>%
      select(characteristic, hcl_label, cm) %>%
      spread(key = hcl_label, value = cm) %>% 
      select(-characteristic) %>% 
      as.matrix()
    m[is.na(m)] <- 0
    mm <- m %*% t(m)  # 1 where two factors share a cluster
    # Omega: prior covariance = within-factor variance + shared cluster variance
    omega <- diag(length(alphas)) * tau_w^2 + mm * tau_c^2
    # Posterior — hoist the matrix inversions; the original solved each of
    # omega and sigma twice.
    omega_inv <- solve(omega)
    sigma_inv <- solve(sigma)
    post_cov <- solve(omega_inv + sigma_inv)
    post_alpha <- post_cov %*% (omega_inv %*% alpha0_vec + sigma_inv %*% alphas)
    # Replication Rate among originally significant factors
    tibble(
      characteristic = chars,
      alpha = drop(post_alpha),
      se = sqrt(diag(post_cov))
    ) %>%
      left_join(char_info, by = "characteristic") %>%
      filter(significance == T) %>%
      summarise(
        repl_rate = mean(alpha - 1.96*se > 0)
      )
  }
  # Full Sample Hyper-parameters: replication rate implied by the full-sample MLE
  repl_rate(
    chars = eb_est[[reg]]$factors$characteristic, 
    alphas = eb_est[[reg]]$factors$ols_est, 
    sigma = eb_est[[reg]]$sigma,
    alpha0 = eb_est[[reg]]$mle %>% filter(estimate == "alpha") %>% pull(ml_est),
    tau_c = eb_est[[reg]]$mle %>% filter(estimate == "tau_c") %>% pull(ml_est),
    tau_w = eb_est[[reg]]$mle %>% filter(estimate == "tau_s") %>% pull(ml_est),
    cluster_labels = cluster_labels, 
    char_info = char_info
  )
  # OOS Hyper-parameters: same factors, but hyper-parameters estimated OOS above
  repl_rate(
    chars = eb_est[[reg]]$factors$characteristic, 
    alphas = eb_est[[reg]]$factors$ols_est, 
    sigma = eb_est[[reg]]$sigma,
    alpha0 = op$mle%>% filter(estimate == "alpha") %>% pull(ml_est),
    tau_c = op$mle %>% filter(estimate == "tau_c") %>% pull(ml_est),
    tau_w = op$mle %>% filter(estimate == "tau_s") %>% pull(ml_est),
    cluster_labels = cluster_labels, 
    char_info = char_info
  )
}


================================================
FILE: Analysis/4 - Output.R
================================================
# Determine Cluster Order
# Fixed display order of the 13 clusters used across all figures and tables.
cluster_order <- c("Accruals", "Debt Issuance", "Investment", "Short-Term Reversal", "Value",
                   "Low Risk", "Quality", "Momentum", "Profitability", "Profit Growth",
                   "Seasonality", "Size", "Low Leverage")

# Collect all output in list
output <- list(figures = list(), tables = list())

# Headline Replication Rate: share of originally significant US factors whose
# EB posterior 2.5th percentile is above zero.
headline_rr <- eb_est$us$factors %>% left_join(char_info, by = "characteristic") %>% filter(significance == 1) %>% summarise(rr = mean(p025>0)) %>% pull(rr)

# Figures --------------------------------------
# HCL
# Plot the US dendrogram with direction bars and manually-placed cluster names.
# tex = TRUE escapes underscores in labels for LaTeX output.
# NOTE(review): the text() y-coordinates are hand-tuned for k = 13 and this
# exact factor set; they will misplace labels if either changes.
output$figures$hcl_us <- function(tex = F) {
  par(mar = c(3,2,1,10), cex = 1) 
  c <- 1.2     # label text size
  x <- 37/2    # x position of the cluster name labels
  label_func <- function(x) unique(clusters$labels$hcl_label)[unique(clusters$labels$hcl_label) %>% str_detect(x)]
  
  clusters_tex <- copy(clusters$dend)
  if (tex == T) {
    labels(clusters_tex) <- labels(clusters_tex) %>% str_replace_all("_", "\\\\_")
  }
  
  clusters_tex %>% plot(horiz=T)
  colored_bars(colors = clusters$bar_colours %>% select(col_dir), dend = clusters_tex, rowLabels = c("Long High"), 
               y_shift = 11/2, horiz = T)
  # Labels (hand-placed next to each cluster's branch)
  if (settings$hcl$k == 13) {
    text(x = x, y = 151, label_func("Short-Term Reversal"), cex = c, col = colours_theme[5], adj = 0)   #gold-9
    text(x = x, y = 141, label_func("Profitability"), cex = c, col = colours_theme[4], adj = 0)  #lightgreen-7
    text(x = x, y = 129, label_func("Low Risk"), cex = c, col = colours_theme[3], adj = 0)  #purple-5
    text(x = x, y = 112, label_func("Value"), cex = c, col = colours_theme[2], adj = 0)       #orange-4
    text(x = x, y = 90, label_func("Investment"), cex = c, col = colours_theme[1], adj = 0)           #darkgreen-3
    text(x = x, y = 73, label_func("Seasonality"), cex = c, col = colours_theme[11], adj = 0)            #orange-4 
    text(x = x, y = 63, label_func("Debt Issuance"), cex = c, col = colours_theme[9], adj = 0)       #red-2
    text(x = x, y = 57, label_func("Size"), cex = c, col = colours_theme[7], adj = 0)            #blue-1  
    text(x = x, y = 51, label_func("Accruals"), cex = c, col = colours_theme[5], adj = 0)   #black-11
    text(x = x, y = 43, label_func("Low Leverage"), cex = c, col = colours_theme[4], adj = 0)            #lightgreen-7
    text(x = x, y = 30, label_func("Profit Growth"), cex = c, col = colours_theme[3], adj = 0)   #purple-5
    text(x = x, y = 22, label_func("Momentum"), cex = c, col = colours_theme[2], adj = 0)               #darkgreen-3
    text(x = x, y = 12, label_func("Quality"), cex = c, col = colours_theme[1], adj = 0)        #blue-1
  }
}

# Cluster Validation
output$figures$hcl_us_val <- clusters$cor %>% cluster_val(labels = clusters$labels, op_format = "pdf")

# Literature comparison
print(output$figures$lit_comp <- eb_est$us %>% plot_lit_comp(mt_res = mt, eb_world = eb_est$world, excl_insig=T))

# Comparing Multiple Testing with Empirical Bayes
# %<-% is zeallot multi-assignment: unpack the two returned plots at once.
c(output$figures$mt_factors, output$figures$mt_summary) %<-% plot_mt_eb_comp(
  mt = mt, eb_all = eb_est$all, eb_us = eb_est$us, eb_developed = eb_est$developed, eb_world = eb_est$world,
  eb_emerging = eb_est$emerging, mts = c("OLS", "Bonferroni", "BY"), 
  regs = c("us", "developed", "emerging", "world"), se_methods = c("OLS", "BY", "EB - Region", "EB - All"),
  se_regions = "us")

# Replication Rate by Region
output$figures$gl_by_cluster <- plot_repl_region(eb_all = eb_est$all, cluster_order = cluster_order)

# Global Factor Posterior
output$figures$gl_by_factor <- eb_est$world %>% plot_factor_post(orig_sig = T, cluster_order = cluster_order)

# CI Many Factors
output$figures$ci_many_fcts <- plot_many_factors()

# Tangency Portfolio - US
output$figures$tpf <- tpf_us %>% plot_tpf(cluster_order = cluster_order, ci_low = 0.05, ci_high = 0.95)

# Tangency Portfolio - Regions
output$figures$tpf_regions <- plot_tpf_region(tpf_us = tpf_us, tpf_dev = tpf_dev, tpf_emer = tpf_emer, cluster_order = cluster_order, ci_low = 0.05, ci_high = 0.95)

# Tangency Portfolio - Size Groups
output$figures$tpf_size <- tpf_size %>% plot_tpf_size(cluster_order = cluster_order, ci_low = 0.05, ci_high = 0.95)

# Posterior over Time - Fixed Taus (colour and black/white versions)
output$figures$overtime <- posterior_over_time %>% plot_over_time(orig_sig = T, ols_incl = T, lb = 5, bw=F)
output$figures$overtime_bw <- posterior_over_time %>% plot_over_time(orig_sig = T, ols_incl = T, lb = 5, bw=T)

# Posterior over Time - Flexible Taus
output$figures$overtime_flex <- posterior_over_time_flex %>% plot_over_time(orig_sig = T, ols_incl = F, lb=5, bw=F)
output$figures$overtime_flex_bw <- posterior_over_time_flex %>% plot_over_time(orig_sig = T, ols_incl = F, lb=5, bw=T)

# Posterior over Time - Flexible Taus - Plot taus
output$figures$overtime_flex_taus <- posterior_over_time_flex %>% plot_taus_over_time() 

# By Size - Overall
output$figures$size_overall <- eb_us_size %>% plot_size_overall(flipped = T, text = F)

# By Size - Clusters
output$figures$size_clusters <- eb_us_size %>% plot_size_clusters(cluster_order = cluster_order)

# Model - False Discovery Rate
output$figures$model_fdr <- model_fdr %>% plot_fdr()

# Simulation - False Discovery Rate
output$figures$sim_fdr <- simulation %>% plot_sim_fdr()

# US versus world factor
output$figures$world_vs_us <- plot_world_vs_us(eb_us = eb_est$us, eb_world_ex_us = eb_est$world_ex_us)

# In-sample vs. OOS and Post
c(output$figures$is_pre, output$figures$is_post, output$figures$is_oos) %<-% plot_is_oos_post(is_oos = is_oos, type = "GLS") 

# In-sample vs. OOS and Post - quadratic
c(output$figures$is_pre_quad, output$figures$is_post_quad, output$figures$is_oos_quad) %<-% plot_is_oos_post_quad(is_oos = is_oos, type = "GLS")

# Effect Sizes
c(output$figures$effect_world, output$figures$effect_regions, output$figures$effect_size) %<-% plot_effects(type = "ols", orig_sig = T, cluster_order = cluster_order)

# Replication Rate by Cluster - US
output$figures$repl_cluster_us <- eb_est$us$factors %>% plot_repl_cluster(orig_sig = T, cluster_order = cluster_order)

# Simulation benchmarked to Harvey et al (2016)
# Guard: the hard-coded tau_w = 0.21 below must match the estimated value.
if (eb_est$us$mle %>% filter(estimate == "tau_s") %>% pull(ml_est) %>% round(2) != 0.21) {
  warning("Tau_w in Harvey et al simulation is inconsistent with data!!")
}
output$figures$sim_harvey <- plot_harvey(harvey_base_res = harvey_base_res, harvey_worst_res = harvey_worst_res, tau_ws = 0.21, act_rr = headline_rr)
output$figures$sim_harvey_robustness <- plot_harvey(harvey_base_res = harvey_base_res, harvey_worst_res = harvey_worst_res, tau_ws = c(0.1, 0.21, 0.3), act_rr = headline_rr)

# TPF Single Factors 
output$figures$tpf_factors_one_cluster <- tpf_factors$wide %>% plot_tpf_one_cluster(cluster_labels = cluster_labels, s = opt_s)
output$figures$tpf_factors_excl_one <- tpf_factors$wide %>% plot_tpf_excl_cl(cluster_labels = cluster_labels, s = opt_s)
output$figures$tpf_factors_imp <- tpf_factors$wide %>% plot_tpf_factor_imp(cluster_labels = cluster_labels, s = opt_s)
output$figures$tpf_factors_imp_cluster <- tpf_factors$wide %>% plot_tpf_factor_imp_cluster(cluster_labels = cluster_labels, s = opt_s)
tpf_evol <- tpf_factors$long %>% plot_tpf_evolution(data_wide = tpf_factors$wide, char_info = char_info, orig_sig_values = settings$tpf_factors$orig_sig, s = opt_s)
output$figures$tpf_evolution <- tpf_evol$plot


# Cumulative returns OOS - Marginally significant factors
output$figures$marg_sig_oos <- sig_oos_pfs %>% plot_sig_oos(sig_type = "marg_sig", cutoff_2012 = T, first_date = as.Date("1990-01-01"), leg_pos = c(.85, .55)) # Also include table for caption

# Cummulative returns OOS - EB significant factors
output$figures$eb_sig_oos <- sig_oos_pfs %>% plot_sig_oos(sig_type = "eb_sig", cutoff_2012 = F, first_date = as.Date("1990-01-01"), leg_pos = c(.85, .40)) # Also include table for caption


# Save Figures as Pictures -------------------------
if (save_figures) {
  # Base settings
  # Save the most recent ggplot to <path>/<name>.<format>. Historically the
  # `format` argument was declared but ignored and the extension hardcoded to
  # ".eps"; some call sites also pass `format = format`, which resolves to the
  # base R *function* `format`. To stay backward compatible, any non-string
  # `format` falls back to "eps".
  output_fig <- function(path, name, format = "eps", width, height) {
    ext <- if (is.character(format) && length(format) == 1) format else "eps"
    file <- paste0(path, "/", name, ".", ext)
    ggsave(file=file, width = width, height = height, units = "in", dpi = 300)
  }
  
  # Default figure height/width in inches.
  h <- 5
  w <- 9
  # For main text -----------------------------------------
  # Each figure is emitted twice: in colour and as a "_bw" black-and-white
  # variant for print.
  # Figure 1 ------
  output$figures$lit_comp
  output_fig(path=output_path, name = "fig1", width = w + 1, height = h)
  dev.off()
  
  output$figures$lit_comp + scale_fill_manual(values = c("black", "grey35"))
  output_fig(path=output_path, name = "fig1_bw", width = w + 1, height = h)
  dev.off()
  
  # Figure 2 -----
  output$figures$marg_sig_oos
  output_fig(path=output_path, name = "fig2", width = w, height = h*2/3)
  dev.off()
  
  output$figures$marg_sig_oos + scale_colour_manual(values = c("black", "black"))
  output_fig(path=output_path, name = "fig2_bw", width = w, height = h*2/3)
  dev.off()
  
  # Figure 3 -----
  output$figures$sim_fdr
  output_fig(path=output_path, name = "fig3", format = format, width = w, height = h)
  dev.off()
  
  output$figures$sim_fdr + scale_colour_manual(values = c("black", "black", "black"))
  output_fig(path=output_path, name = "fig3_bw", format = format, width = w, height = h)
  dev.off()
  
  
  # Figure 4 -----
  output$figures$mt_factors + theme(text = element_text(size = 13), axis.text.x = element_blank())
  output_fig(path=output_path, name = "fig4", format = format, width = w, height = h)
  dev.off()
  
  output$figures$mt_factors + theme(text = element_text(size = 13), axis.text.x = element_blank()) + 
    scale_colour_manual(values = rep("black", 3))
  output_fig(path=output_path, name = "fig4_bw", format = format, width = w, height = h)
  dev.off()
  
  # Figure 5a -----
  output$figures$size_overall + theme(
    axis.title.x = element_text(size = 17),
    axis.text.x = element_text(size = 17),
    axis.text.y = element_text(size = 17))
  output_fig(path=output_path, name = "fig5a", format = format, width = w, height = h)
  dev.off()
  
  output$figures$size_overall + theme(
    axis.title.x = element_text(size = 17),
    axis.text.x = element_text(size = 17),
    axis.text.y = element_text(size = 17)) + geom_col(fill="grey35")
  output_fig(path=output_path, name = "fig5a_bw", format = format, width = w, height = h)
  dev.off()
  
  # Figure 5b -----
  output$figures$repl_cluster_us + theme(
    axis.text.y = element_text(size = 13), 
    axis.text.x = element_text(size = 14), 
    text = element_text(size = 14))
  output_fig(path=output_path, name = "fig5b", format = format, width = w, height = h)
  dev.off()
  
  output$figures$repl_cluster_us + theme(
    axis.text.y = element_text(size = 13), 
    axis.text.x = element_text(size = 14), 
    text = element_text(size = 14)) + scale_fill_manual(values = rep("grey35", 13))
  output_fig(path=output_path, name = "fig5b_bw", format = format, width = w, height = h)
  dev.off()
  
  # Figure 6 -----
  output$figures$mt_summary + 
    theme(
      text = element_text(size = 13),
      axis.title.x = element_blank(), 
      axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1, size = 9)
    )
  output_fig(path=output_path, name = "fig6", format = format, width = w, height = h)
  dev.off()
  
  output$figures$mt_summary + 
    theme(
      text = element_text(size = 13),
      axis.title.x = element_blank(), 
      axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1, size = 9)
    ) +
    scale_fill_grey()
  output_fig(path=output_path, name = "fig6_bw", format = format, width = w, height = h)
  dev.off()
  
  # Figure 7 -----
  output$figures$world_vs_us + theme(text = element_text(size = 13)) 
  output_fig(path=output_path, name = "fig7", format = format, width = h, height = h)
  dev.off()
  
  output$figures$world_vs_us + theme(text = element_text(size = 13)) + scale_colour_manual(values = c("black", "grey35"))
  output_fig(path=output_path, name = "fig7_bw", format = format, width = h, height = h)
  dev.off()
  
  # Figure 8a -----
  output$figures$is_pre + theme(
    text = element_text(size = 12), 
    plot.title = element_text(size = 10, vjust = -2), 
    plot.subtitle = element_text(size = 8, vjust = 0),
    plot.margin = unit(c(0,0,0,0), "cm")
  ) 
  output_fig(path=output_path, name = "fig8a", format = format, width = w/3, height = w/3)
  dev.off()
  
  output$figures$is_pre + theme(
    text = element_text(size = 12), 
    plot.title = element_text(size = 10, vjust = -2), 
    plot.subtitle = element_text(size = 8, vjust = 0),
    plot.margin = unit(c(0,0,0,0), "cm")
  )  + geom_point(colour = "black")
  output_fig(path=output_path, name = "fig8a_bw", format = format, width = w/3, height = w/3)
  dev.off()
  
  # Figure 8b -----
  output$figures$is_post  + theme(
    text = element_text(size = 12), 
    plot.title = element_text(size = 10, vjust = -2), 
    plot.subtitle = element_text(size = 8, vjust = 0),
    plot.margin = unit(c(0,0,0,0), "cm")
  ) 
  output_fig(path=output_path, name = "fig8b", format = format, width = w/3, height = w/3)
  dev.off()
  
  output$figures$is_post  + theme(
    text = element_text(size = 12), 
    plot.title = element_text(size = 10, vjust = -2), 
    plot.subtitle = element_text(size = 8, vjust = 0),
    plot.margin = unit(c(0,0,0,0), "cm")
  ) + geom_point(colour = "black")
  output_fig(path=output_path, name = "fig8b_bw", format = format, width = w/3, height = w/3)
  dev.off()
  
  # Figure 8c -----
  output$figures$is_oos  + theme(
    text = element_text(size = 12), 
    plot.title = element_text(size = 10, vjust = -2), 
    plot.subtitle = element_text(size = 8, vjust = 0),
    plot.margin = unit(c(0,0,0,0), "cm")
  ) 
  output_fig(path=output_path, name = "fig8c", format = format, width = w/3, height = w/3)
  dev.off()
  
  output$figures$is_oos  + theme(
    text = element_text(size = 12), 
    plot.title = element_text(size = 10, vjust = -2), 
    plot.subtitle = element_text(size = 8, vjust = 0),
    plot.margin = unit(c(0,0,0,0), "cm")
  )  + geom_point(colour = "black")
  output_fig(path=output_path, name = "fig8c_bw", format = format, width = w/3, height = w/3)
  dev.off()
  
  # Figure 9 -----
  output$figures$overtime
  output_fig(path=output_path, name = "fig9", format = format, width = w, height = h)
  dev.off()
  
  output$figures$overtime_bw
  output_fig(path=output_path, name = "fig9_bw", format = format, width = w, height = h)
  dev.off()
  
  
  # Figure 10 -----
  output$figures$sim_harvey + theme(
    text = element_text(size = 12),
    axis.text.x = element_text(size = 11),
    axis.text.y = element_text(size = 11),
    legend.text = element_text(size = 11)
  )
  output_fig(path=output_path, name = "fig10", format = format, width = w, height = h)
  dev.off()
  
  output$figures$sim_harvey + theme(
    text = element_text(size = 12),
    axis.text.x = element_text(size = 11),
    axis.text.y = element_text(size = 11),
    legend.text = element_text(size = 11)
  ) + scale_colour_manual(values = rep("black", 3))
  output_fig(path=output_path, name = "fig10_bw", format = format, width = w, height = h)
  dev.off()
  
  # Figure 11 -----
  output$figures$gl_by_factor + theme(text = element_text(size = 13), legend.position = "right", axis.text.x = element_text(size = 5)) 
  output_fig(path=output_path, name = "fig11", format = format, width = w, height = h)
  dev.off()
  
  output$figures$gl_by_factor + theme(text = element_text(size = 13), legend.position = "right", axis.text.x = element_text(size = 5)) +
    scale_colour_manual(values = rep("black", 13))
  output_fig(path=output_path, name = "fig11_bw", format = format, width = w, height = h)
  dev.off()
  
  
  # Figure 12 -----
  output$figures$effect_size + theme(
    axis.text.y = element_text(size = 13), 
    strip.text.x = element_text(size = 14),
    axis.text.x = element_text(size = 12),
    axis.title.x = element_text(size = 13)
  )
  output_fig(path=output_path, name = "fig12a", format = format, width = w, height = h)
  dev.off()
  
  output$figures$effect_size + theme(
    axis.text.y = element_text(size = 13), 
    strip.text.x = element_text(size = 14),
    axis.text.x = element_text(size = 12),
    axis.title.x = element_text(size = 13)
  ) + scale_fill_manual(values = rep("grey35", 13))
  output_fig(path=output_path, name = "fig12a_bw", format = format, width = w, height = h)
  dev.off()
  
  # Figure 12b -----
  output$figures$effect_regions + theme(
    axis.text.y = element_text(size = 13), 
    strip.text.x = element_text(size = 13),
    axis.text.x = element_text(size = 12),
    axis.title.x = element_text(size = 13)
  )
  output_fig(path=output_path, name = "fig12b", format = format, width = w, height = h)
  dev.off()
  
  output$figures$effect_regions + theme(
    axis.text.y = element_text(size = 13), 
    strip.text.x = element_text(size = 13),
    axis.text.x = element_text(size = 12),
    axis.title.x = element_text(size = 13)
  ) + scale_fill_manual(values = rep("grey35", 13))
  output_fig(path=output_path, name = "fig12b_bw", format = format, width = w, height = h)
  dev.off()
  
  # Figure 13 -----
  output$figures$tpf + theme(text = element_text(size = 13), axis.text.x = element_text(size = 13), axis.text.y = element_text(size = 13))
  output_fig(path=output_path, name = "fig13", format = format, width = w, height = h)
  dev.off()
  
  output$figures$tpf + theme(text = element_text(size = 13), axis.text.x = element_text(size = 13), axis.text.y = element_text(size = 13)) +
    scale_fill_manual(values = rep("grey35", 14))
  output_fig(path=output_path, name = "fig13_bw", format = format, width = w, height = h)
  dev.off()
  
  # Figure 14 -----
  # Evolution of TPF
  output$figures$tpf_evolution
  output_fig(path=output_path, name = "fig14", format = format, width = w, height = h*0.8)
  dev.off()
  
  output$figures$tpf_evolution
  output_fig(path=output_path, name = "fig14_bw", format = format, width = w, height = h*0.8)
  dev.off()
  
  
  # Figure IA.1 ----- # NEED TO DO!!
  if (FALSE) {
    # Need to run everything with settings$weighting$us = "vw" and settings$weighting$global_ex_us="vw"
    output$figures$lit_comp
    output_fig(path=output_path, name = "figIA1", format = format, width = w + 1, height = h)
    dev.off()
  }
  
  
  # Figure IA.2 ------
  output$figures$eb_sig_oos
  output_fig(path=output_path, name = "figIA2", format = format, width = w, height = h*2/3)
  dev.off()
  
  # Figure IA.3a -----
  output$figures$is_pre_quad + theme(
    text = element_text(size = 12), 
    plot.title = element_text(size = 10, vjust = -2), 
    plot.subtitle = element_text(size = 8, vjust = 0),
    plot.margin = unit(c(0,0,0,0), "cm")
  ) 
  output_fig(path=output_path, name = "figIA3a", format = format, width = w/3, height = w/3)
  dev.off()
  
  # Figure IA.3b -----
  output$figures$is_post_quad  + theme(
    text = element_text(size = 12), 
    plot.title = element_text(size = 10, vjust = -2), 
    plot.subtitle = element_text(size = 8, vjust = 0),
    plot.margin = unit(c(0,0,0,0), "cm")
  )  
  output_fig(path=output_path, name = "figIA3b", format = format, width = w/3, height = w/3)
  dev.off()
  
  # Figure IA.3c -----
  output$figures$is_oos_quad  + theme(
    text = element_text(size = 12), 
    plot.title = element_text(size = 10, vjust = -2), 
    plot.subtitle = element_text(size = 8, vjust = 0),
    plot.margin = unit(c(0,0,0,0), "cm")
  ) 
  output_fig(path=output_path, name = "figIA3c", format = format, width = w/3, height = w/3)
  dev.off()
  
  
  # Figure IA.4 -----
  output$figures$overtime_flex + theme(text = element_text(size = 13))
  output_fig(path=output_path, name = "figIA4", format = format, width = w, height = h)
  dev.off()
  
  # Figure IA.5 -----
  output$figures$overtime_flex_taus + theme(text = element_text(size = 13), legend.text = element_text(size = 12))
  output_fig(path=output_path, name = "figIA5", format = format, width = w, height = h)
  dev.off()
  
  # Figure IA.6 ----
  output$figures$sim_harvey_robustness + theme(
    text = element_text(size = 12),
    axis.text.x = element_text(size = 11),
    axis.text.y = element_text(size = 11),
    legend.text = element_text(size = 11)
  )
  output_fig(path=output_path, name = "figIA6", format = format, width = w, height = h)
  dev.off()
  
  # Figure IA.7 ----
  output$figures$gl_by_cluster  + theme(text = element_text(size = 13))
  output_fig(path=output_path, name = "figIA7", format = format, width = w, height = h)
  dev.off()
  
  # Figure IA.8 ----
  output$figures$size_clusters
  output_fig(path=output_path, name = "figIA8", format = format, width = w, height = h+1)
  dev.off()
  
  # Figure IA.9 ----
  output$figures$tpf_regions  + theme(text = element_text(size = 13), axis.text.x = element_text(size = 13), axis.text.y = element_text(size = 13))
  output_fig(path=output_path, name = "figIA9", format = format, width = w, height = h)
  dev.off()
  
  # Figure IA.10 ----
  output$figures$tpf_size  + theme(
    text = element_text(size = 13), 
    axis.text.x = element_text(size = 13), 
    axis.text.y = element_text(size = 11))
  
  output_fig(path=output_path, name = "figIA10", format = format, width = w, height = h*1.5)
  dev.off()
  
  # Figure IA.11 ----
  output$figures$tpf_factors_one_cluster
  output_fig(path=output_path, name = "figIA11", format = format, width = w, height = h)
  dev.off()
  
  # Figure IA.12 ----
  output$figures$tpf_factors_excl_one
  output_fig(path=output_path, name = "figIA12", format = format, width = w, height = h)
  dev.off()
  
  # Figure IA.13 ----
  output$figures$tpf_factors_imp_cluster + theme(
    axis.text.y = element_text(size = 6)
  )
  output_fig(path=output_path, name = "figIA13", format = format, width = w, height = h*1.5)
  dev.off()
  
  # Figure IA.14 ----
  output$figures$tpf_factors_imp + theme(
    axis.text.y = element_text(size = 6)
  )
  output_fig(path=output_path, name = "figIA14", format = format, width = w, height = h*1.5)
  dev.off()
  
  # Figure IA.15 (special, because not ggplot) ----
  # hcl_us is a plotting closure, so we open a pdf device manually.
  pdf(str_c(output_path, "/figIA15.pdf"), width = w, height = h*2+1.5)
  output$figures$hcl_us()
  dev.off()
  
  # Figure IA.16 (special, because not ggplot) ----
  pdf(str_c(output_path, "/figIA16.pdf"), width = 6, height = 6)
  output$figures$hcl_us_val()
  dev.off()
  
  
  
  
  # Other figures not included in paper --------
  output$figures$ci_many_fcts + theme(axis.text.y = element_text(size = 13), text = element_text(size = 13))
  output$figures$model_fdr + theme(text = element_text(size = 13))

  # TABLES -----------------
  # Estimated Taus
  table_taus()
  # Economic Benefit of more Power
  sig_oos_pfs %>% table_economic_benefit()
  # Factor Performance
  table_factor_info()
  
  # Numbers mentioned in paper --------
  # Bayesian Multiple Testing
  bayes_sim <- 1000000
  (fdr_196 <- fdr_fwer_rates(t_cutoff = 1.96, orig_sig = TRUE, a_vec = eb_est$world$factor_mean, a_cov = eb_est$world$factor_cov, n_sim = bayes_sim, seed=settings$seed))
  (fdr_278 <- fdr_fwer_rates(t_cutoff = 2.78, orig_sig = TRUE, a_vec = eb_est$world$factor_mean, a_cov = eb_est$world$factor_cov, n_sim = bayes_sim, seed=settings$seed))
  (true_factors_tbl <- true_factors(t_cutoff = 0, a_vec = eb_est$world$factor_mean, a_cov = eb_est$world$factor_cov, orig_sig = TRUE, n_sim = bayes_sim, seed=settings$seed))
  (rr_unc <- true_factors(t_cutoff = 1.96, a_vec = eb_est$world$factor_mean, a_cov = eb_est$world$factor_cov, orig_sig = TRUE, n_sim = bayes_sim, seed=settings$seed))
  
  # Mentioned in introduction
  paste0("Replication rate SE: ", round(rr_unc$sd*100, 2), "%")
  paste0("Bayesian FDR: ", round(fdr_196$fdr_dist$mean*100, 2), "%, with 95% CI of [", round(fdr_196$fdr_dist$p025*100, 2), "%, ", round(fdr_196$fdr_dist$p975*100, 2), "%], SE: ", round(fdr_196$fdr_dist$sd*100, 2))
  paste0("Bayesian FWER: ", round(fdr_196$fwer_dist$mean*100, 2), "%, with SE of ", round(fdr_196$fwer_dist$sd*100, 2), "%")
  paste0("Expected fraction of true factors: ", round(true_factors_tbl$mean*100, 2), "%, with SE of ", round(true_factors_tbl$sd*100, 2), "%")
  
  
  # BY cutoff
  # The implied t-stat cutoff: midpoint between the largest insignificant and
  # the smallest significant |t| under BY-adjusted p-values.
  mt %>%
    filter(method == "BY" & region == "us") %>%
    mutate(sig = p<=0.05) %>%
    group_by(sig) %>%
    mutate(
      max = max(abs(statistic)),
      min = min(abs(statistic))
    ) %>%
    filter(abs(statistic)==max & sig==FALSE | abs(statistic)==min & sig==TRUE) %>%
    ungroup() %>%
    summarise(
      by_cutoff = mean(abs(statistic))
    ) %>% 
    print()
  
  # Change in Book equity factor
  be_gr_us <- eb_est$us$factors %>% 
    filter(region == "us" & characteristic == "be_gr1a")
  be_gr_all <- eb_est$all$factors %>% 
    filter(region == "us" & characteristic == "be_gr1a")
  tibble(
    characteristic = rep("be_gr1a",2), 
    region = rep("US", 2), 
    data = c("US", "Global"), 
    post_mean = c(be_gr_us$post_mean, be_gr_all$post_mean), 
    post_vol = c(be_gr_us$post_sd, be_gr_all$post_sd),
    t = post_mean / post_vol
  ) %>% print()
  
  # IS / OOS 
  is_oos$post$regs %>%
    ungroup() %>%
    summarise(
      is = mean(is),
      post = mean(oos),
      decline = post/is-1
    ) %>% 
    print()
  
  c("pre","post","pre_post") %>% lapply(function(x) {
    is_oos[[x]]$regs %>% mutate(period = x)
  }) %>%
    bind_rows() %>%
    group_by(period) %>%
    summarise(
      n = n(),
      is = mean(is > 0),
      oos = mean(oos > 0)
    ) %>% 
    print()
  
  # Posterior over time width
  posterior_over_time %>% plot_over_time(orig_sig = TRUE, ols_incl = TRUE, lb = 5)
  
  # Bayesian Multiple Testing
  fdr_196$fdr_dist
  fdr_278$fwer_fdr # FWER at t>2.78
  true_factors_tbl
  
  # Replication rates in different size groups
  eb_us_size %>% plot_size_overall(flipped = TRUE, text = TRUE)
  
  # Publication Bias
  plot_harvey(harvey_base_res = harvey_base_res, harvey_worst_res = harvey_worst_res, tau_ws = 0.21, act_rr = headline_rr)
  
  # Correlations across size and region
  eb_us_size %>%
    select(characteristic, size_grp, ols_est) %>%
    spread(key = size_grp, value = ols_est) %>%
    summarise(
      cor_mega_micro = cor(Mega, Micro, method = "spearman"),
      cor_mega_nano = cor(Mega, Nano, method = "spearman")
    ) %>%
    print()
  
  eb_est$all$factors %>%
    select(characteristic, region, ols_est) %>%
    spread(key = region, value = ols_est) %>%
    na.omit() %>%
    summarise(
      n = n(),
      cor_us_dev = cor(us, developed, method = "spearman"),
      cor_us_emer = cor(us, emerging, method = "spearman")
    ) %>%
    print()
  
  # TPF Evolution numbers
  tpf_evol$data %>%
    arrange(year) %>%
    mutate(tpf_sr_l1 = dplyr::lag(tpf_sr)) %>%
    filter(year %in% c(min(year), max(year), 2002, 1991)) %>% # char_info[characteristic %in% c("seas_2_5an", "oaccruals_at")]
    arrange(year) %>%
    mutate(
      new_inclusions = case_when(
        year == 1971 ~ "Market",
        year == 1991 ~ "Accruals",
        year == 2002 ~ "Seasonality",
        year == year(settings$end_date) ~ "[All factors included]",
      )
    ) %>%
    print()
  
  # Average pairwise correlations
  eb_est$us$input$long %>%
    select(characteristic, eom, ret_neu) %>%
    spread(key = characteristic, value = ret_neu) %>%
    select(-eom) %>%
    cor(use = "pairwise.complete.obs") %>%
    as_tibble(rownames = "char1") %>%
    gather(-char1, key = "char2", value = "cor") %>% 
    filter(char1 != char2) %>% 
    summarise(average_cor = mean(cor))
  
}

================================================
FILE: Analysis/Analysis.Rproj
================================================
Version: 1.0

RestoreWorkspace: Default
SaveWorkspace: Default
AlwaysSaveHistory: Default

EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 2
Encoding: UTF-8

RnwWeave: Sweave
LaTeX: pdfLaTeX


================================================
FILE: Analysis/README.md
================================================
## Overview
This repository contains code that is used for the paper [Is There a Replication Crisis in Finance?](https://onlinelibrary.wiley.com/doi/full/10.1111/jofi.13249) by Jensen, Kelly and Pedersen (2023). The code used to create the underlying dataset can be found at [https://github.com/bkelly-lab/GlobalFactor](https://github.com/bkelly-lab/GlobalFactor). Please cite Jensen, Kelly and Pedersen (2023) if using the code or data. 

## How To Run the Code

1. Start with factor returns in each country. The files you need to download are `hml.csv`, `cmp.csv` and `market_returns.csv` using one of the following methods.
	1. Download the factor returns used in the paper [here](https://www.dropbox.com/sh/wcrjok1qyxtrasi/AABZ90GDCUvIzDzijt8Qoo3ha?dl=0). 
	1. Download the latest version of the factor returns [here](https://www.dropbox.com/sh/xq278bryrj0qf9s/AABUTvTGok91kakyL07LKyQoa?dl=0). 
	1. Generate the factor returns from scratch by following the steps in [GlobalFactors](https://github.com/bkelly-lab/ReplicationCrisis/tree/master/GlobalFactors) as the output from `portfolios.R`. 
2. Copy the code from this repository to a local folder. 
3. Open `main.R` in the programming language "R".
4. Ensure that the current working directory is the folder from 2. To check this, write `getwd()` in the console. To change the working directory use `setwd()`.
5. Run `main.R`.

## Outputs
1. The console prints key numbers used in the paper as well as the paper tables in LaTeX format.
2. If `save_figures=TRUE` (default), the folder in `output_path` will contain figures of the same format used in the paper. 

## Optional Settings 
1. `data_path` is the folder with the portfolio data from step 1 (default: current working directory/Data).
2. `object_path` is a folder where R objects can be saved for faster iterations (default: current working directory/Objects).
3. `output_path` is a folder where figures can be saved (default: current working directory/Figures).
4. `save_figures` should be `TRUE` if you wish to save figures, otherwise `FALSE` (default: TRUE).
5. `settings` controls settings for the analysis, including the start and end date, the portfolio weighting scheme, the cluster settings, the empirical Bayes settings, and the tangency portfolio settings (default: the settings in the paper).  

## Notes

The code is divided into five separate R scripts. `0 - Functions.R` contains the project functions, `1 - Prepare Data.R` prepares the data, `2 - Determine Clusters.R` finds statistical clusters based on return data, `3 - Analysis.R` analyzes the data and `4 - Output.R` generates tables and figures based on the analysis. Everything is sourced from `main.R` which also contains user-defined control variables.  


================================================
FILE: Analysis/country_stats.R
================================================
library(xtable)
library(tidyverse)
library(data.table)

# Data [output from SAS code, already screened with obs_main=1, primary_sec=1, exch_main=1]
# Read every per-country characteristics file and stack them into one data.table.
data_path <- "../../Data/Characteristics"
country_files <- list.files(data_path)
countries <- country_files %>% lapply(function(file) {
  fread(paste0(data_path, "/", file), select = c("excntry", "id", "eom", "me", "size_grp", "ret_local"))
}) %>% rbindlist()
# eom arrives as integer yyyymmdd; convert to Date.
countries[, eom := eom %>% as.character() %>% lubridate::fast_strptime(format = "%Y%m%d") %>% as.Date()]
# Aggregate by month
# Per country-month: stock counts (total, nano, mega), total and median market equity.
country_info <- countries[!is.na(me) & !is.na(ret_local), .(
  n = .N,
  n_nano = sum(size_grp == "nano"),
  n_mega = sum(size_grp == "mega"),
  me = sum(me),
  me_p50 = median(me)
), by = .(excntry, eom)]
# Country Classification
# Attach the MSCI development classification (developed/emerging/frontier) by ISO country code.
country_classification <- readxl::read_xlsx("Country Classification.xlsx", 
                                            sheet = "countries", range = "A1:C200") %>%
  select(excntry, msci_development) %>%
  filter(!is.na(excntry)) %>%
  setDT()
country_info <- country_classification[country_info, on = "excntry"]

# Table
# Table
# Print a LaTeX longtable of per-country summary statistics as of `info_date`.
#
# Args:
#   country_info: data.table with columns excntry, msci_development, eom, n,
#                 n_mega, n_nano, me, me_p50 (one row per country-month).
#                 NOTE: modified by reference — a start_date column is added.
#   info_date:    Date; the month-end at which the snapshot is taken. Countries
#                 with no data on that date are included with zero counts.
#
# Side effect: prints the xtable to the console; no value is returned.
table_country <- function(country_info, info_date) {
  tbl_caption <- paste("The table shows summary statistics by the country where a security is listed.\\", 
                       "We include common stocks that are the primary security of the underlying firm, traded on a standard exchange, with non-missing return and market equity data.\\",
                       "\\textit{Country} is the ISO code of the underlying exchange country.\\", 
                       "For further information, see \\href{https://en.wikipedia.org/wiki/List_of_ISO_3166_country_codes}{https://en.wikipedia.org/wiki/List\\_of\\_ISO\\_3166\\_country\\_codes.}",
                       "\\textit{MSCI} shows the MSCI classification of each country as of January 7th 2021.", 
                       "For the most recent classification, see \\href{https://www.msci.com/market-classification}{https://www.msci.com/market-classification}.",
                       "\\textit{Start} is the first date with a valid observation.",
                       "In the next 4 columns, the data is shown as of December 31st 2020.\\",
                       "\\textit{Stocks} is the number of stocks available.\\", 
                       "\\textit{Mega stocks} is the number of stocks with a market cap above the 80th percentile of NYSE stocks.\\",
                       "\\textit{Total Market Cap} is the aggregate market cap in million USD.\\",
                       "\\textit{Median MC} is the median market cap in million USD.")
  
  # Country summary
  country_info[, start_date := min(eom), by = excntry]  # first valid month per country (by reference)
  country_stats <- country_info[eom==info_date]
  countries_add <- country_info[!(excntry %in% country_stats$excntry)][, .(excntry, msci_development, eom = info_date, start_date, n=0, n_mega=0, n_nano=0, me=0, me_p50=0)] %>% unique()# Add countries without data by info_date
  country_stats <- country_stats %>% rbind(countries_add) %>% arrange(-me) %>% select(-c(n_nano, eom))
  
  # Aggregate "All" row appended at the bottom of the table.
  total <- country_stats %>%
    ungroup() %>%
    summarise(
      excntry = "All",
      msci_development = "",
      start_date = NA,
      n = sum(n),
      n_mega = sum(n_mega),
      me = sum(me),
      me_p50 = NA_real_
    )
  
  country_stats %>%
    bind_rows(total) %>%
    mutate(
      n = n %>% prettyNum(big.mark = ",", digits = 0),
      n_mega = n_mega %>% prettyNum(big.mark=",", digits = 0),
      start_date = as.character(start_date),
      me = me %>% formatC(format = "e", digits = 2),
      me_p50 = me_p50 %>% prettyNum(big.mark=",", digits = 0),
      msci_development = msci_development %>% str_to_title(),
      " "= ' ',
    ) %>%
    select(excntry, msci_development, start_date, ` `, everything()) %>%
    rename("Country" = excntry, "MSCI" = msci_development, "Start" = start_date, "Stocks" = n, "Mega Stocks" = n_mega,
           "Total Market Cap" = me, "Median MC" = me_p50) %>%
    xtable(digits = 0, align = "lllllrrrr", caption = tbl_caption) %>%
    print(include.rownames = TRUE, floating = FALSE, latex.environments = "center", hline.after=c(-1, 0), 
          tabular.environment = "longtable", size="\\fontsize{10pt}{12pt}\\selectfont")
}

# Output for paper--------------------
# Remember:
# Only copy from "& country ..." and down.
# In line 94: 
#  - Delete "94"
#  - Delete "NA"
#  - Add hline above and below total
#  - Make "All" in \textbf{}
country_info[!(excntry %in% c("ZWE", "VEN"))] %>% table_country(info_date = as.Date("2020-12-31"))  # We exclude Zimbabwe and Venezuela due to data issues

# Nano Caps in the US
country_info[eom == as.Date("2020-12-31") & excntry == "USA", .(n, n_nano, nano_prop = n_nano / n)] %>% print()


================================================
FILE: Analysis/hxz_decomp.R
================================================
library(lubridate)
library(tidyverse)
library(data.table)
options(dplyr.summarise.inform = FALSE)

# User Input -----------------------
# Paths
data_path <- "../../Data" # Set to path with global characteristics data.
# Start Date
start <- as.Date("2020-12-31")

# Data -----------------------------
# Characteristics
# Characteristic metadata: JKP abbreviation, expected sign, and original significance.
char_info <- readxl::read_xlsx("Factor Details.xlsx",
                               sheet = "details", range = "A1:N300") %>%
  filter(!is.na(abr_jkp)) %>%
  select("characteristic"=abr_jkp, direction, significance) %>%
  mutate(direction = direction %>% as.numeric()) %>%
  setDT()
chars <- char_info$characteristic

# NYSE Cutoff 
nyse_size_cutoffs <- fread(paste0(data_path, "/nyse_cutoffs.csv"), colClasses = c("eom"="character"))
nyse_size_cutoffs[, eom := as.Date(eom, format = "%Y%m%d")]

# Return Cutoffs
# Monthly winsorization bounds for Compustat-sourced returns.
ret_cutoffs <- fread(paste0(data_path, "/return_cutoffs.csv"), colClasses = c("eom"="character"))
ret_cutoffs[, eom := as.Date(eom, format = "%Y%m%d")]
ret_cutoffs[, eom_lag1 := floor_date(eom, unit = "month") - 1]  # Because we use ret_exc_lead1m

# Data
data <- fread(paste0(data_path, "/Characteristics/usa.csv"), 
              select = c("excntry", "id", "eom", "source_crsp", "comp_exchg", "crsp_exchcd", "size_grp", "ret_exc", "ret_exc_lead1m", "me", chars), colClasses = c("eom"="character"))
data[, eom := as.Date(lubridate::fast_strptime(eom, format = "%Y%m%d"))]
# ME CAP
# Market equity capped at the NYSE 80th percentile.
data <- nyse_size_cutoffs[, .(eom, nyse_p80)][data, on = "eom"]
data[, me_cap := pmin(me, nyse_p80)][, nyse_p80 := NULL]
# Screens
data <- data[!is.na(size_grp) & !is.na(me) & !is.na(ret_exc_lead1m)]

# Winsorize Compustat Returns
# Clamp non-CRSP returns to the monthly 0.1%/99.9% bounds; CRSP rows untouched.
data <- ret_cutoffs[, .(eom, "p001"=ret_exc_0_1, "p999"=ret_exc_99_9)][data, on = "eom"]
data[source_crsp == 0 & ret_exc > p999, ret_exc := p999]
data[source_crsp == 0 & ret_exc < p001, ret_exc := p001]
data[, c("p001", "p999") := NULL]
# Same, for the one-month-ahead return (cutoffs matched on the lagged month-end).
data <- ret_cutoffs[, .("eom" = eom_lag1, "p001"=ret_exc_0_1, "p999"=ret_exc_99_9)][data, on = "eom"]
data[source_crsp == 0 & ret_exc_lead1m > p999, ret_exc_lead1m := p999]
data[source_crsp == 0 & ret_exc_lead1m < p001, ret_exc_lead1m := p001]
data[, c("source_crsp", "p001", "p999") := NULL]

# Create 1 month separated observations
# Full id x month grid so that dplyr::lead() below steps in calendar months.
returns <- tidyr::crossing("id" = unique(data$id), "eom" = unique(data$eom)) %>% setDT()
returns <- data[, .(id, eom, ret_exc)][returns, on = .(id, eom)]
returns[, start := min(eom[!is.na(ret_exc)]), by = id]
returns <- returns[eom >= start][, start := NULL]
returns[, last := floor_date(max(eom[!is.na(ret_exc)]), unit = "month") + months(12) - 1, by = id]  # To avoid lookahead bias, _horizon_ months after last obs
returns <- returns[eom <= last][, last := NULL]
returns %>% setorder(id, eom)

# Build one-way sorted high-minus-low (HML) factor returns for every
# characteristic, at realized-return horizons 1..horizon months ahead.
#
# Arguments:
#   chars:      characteristic column names present in the global `data`
#   pfs:        number of portfolios in the sort (3 = terciles, 10 = deciles)
#   bps:        breakpoint universe: "nyse" (NYSE-listed stocks only) or
#               "non_mc" (mega/large/small size groups, i.e. non-micro caps)
#   bp_min_n:   minimum number of breakpoint stocks required per month
#   min_stocks: minimum stocks in BOTH extreme portfolios for the month's
#               HML return to be kept
#   horizon:    maximum lead (in months) of the realized return
#
# Value: a data.table with one HML row per characteristic x eom_ret x lead,
# holding equal-, value-, and capped-value-weighted long-short returns.
#
# NOTE(review): mutates the global `data` in place (adds a `var` column) and
# reads the globals `data` and `returns`.
pf_func <- function(chars, pfs, bps, bp_min_n, min_stocks, horizon) {
  # Realized Returns 
  # For h == 1 use the precomputed one-month-ahead return; for h > 1 step
  # through the 1-month-spaced `returns` grid so the lead is exactly h months.
  ret_lead <- 1:horizon %>% lapply(function(h) {
    if (h==1) {
      r <- data[, .(id, eom, lead = 1, ret_exc = ret_exc_lead1m)][!is.na(ret_exc)]
    } else {
      r <- returns[, .(eom, lead = h, ret_exc = dplyr::lead(ret_exc, n = h)), by = id][!is.na(ret_exc)]
    }
    # Month end at which the lead-h return is realized
    r[, eom_ret := ceiling_date(eom, unit = "months")+months(h)-1]
  }) %>% rbindlist()
  # Portfolios 
  chars %>% lapply(function(x) {
    print(paste0("   " , x, ": ", match(x, chars), " out of ", length(chars)))
    data[, var := as.double(get(x))]  # NOTE: modifies the global `data` in place
    sub <- data[!is.na(var), .(id, eom, var, size_grp, me, me_cap, crsp_exchcd, comp_exchg)]
    # Portfolio Assignment
    # Flag stocks whose characteristic values define the breakpoints.
    if (bps == "nyse") {
      sub[, bp_stock := (crsp_exchcd == 1 & is.na(comp_exchg)) | (comp_exchg == 11 & is.na(crsp_exchcd))]
    }
    if (bps == "non_mc") {
      sub[, bp_stock := (size_grp %in% c("mega", "large", "small"))]
    }
    sub[, bp_n := sum(bp_stock), by = eom]
    sub <- sub[bp_n >= bp_min_n] # require at least bp_min_n breakpoint stocks per month
    # Empirical CDF from breakpoint stocks only, evaluated for all stocks
    sub[, cdf := ecdf(var[bp_stock == T])(var), by = eom]
    sub[, min_cdf := min(cdf), by = eom]
    sub[cdf == min_cdf, cdf := 0.00000001] # To ensure that the lowest value is in portfolio 1   
    sub[, pf := ceiling(cdf*pfs), by = eom]  
    sub[pf == 0, pf := 1]  # Happens when non-bp stocks extend beyond bp stock range
    # Align with returns
    sub <- sub[, .(id, eom, me, me_cap, pf)][ret_lead, on = .(id, eom)][!is.na(pf)]
    # Returns 
    # Equal-, value-, and capped-value-weighted returns per portfolio/month/lead
    pf_returns <- sub[, .(
      characteristic = x,
      n = .N,
      ret_ew = mean(ret_exc),
      ret_vw = sum(ret_exc * me) / sum(me),
      ret_vw_cap = sum(ret_exc * me_cap) / sum(me_cap)
    ), by = .(pf, eom_ret, lead)]
    # HML
    # Top minus bottom portfolio; drop months with too few stocks in a leg
    pf_returns[, .(
      characteristic = x,
      n_stocks_min = as.integer(min(n[pf==pfs], n[pf==1])),
      ret_ew = ret_ew[pf == pfs] - ret_ew[pf == 1],
      ret_vw = ret_vw[pf == pfs] - ret_vw[pf == 1],
      ret_vw_cap = ret_vw_cap[pf == pfs] - ret_vw_cap[pf == 1]
    ), by = .(eom_ret, lead)][!is.na(ret_ew) & n_stocks_min >= min_stocks]
  }) %>% rbindlist()
}

# Output 
# HML factor returns: terciles with non-micro-cap breakpoints, and NYSE deciles.
system.time(hml_nonmc3 <- chars %>% pf_func(pfs = 3, bps = "non_mc", bp_min_n = 5, min_stocks = 5, horizon = 12))  # 47 min
system.time(hml_nyse10 <- chars %>% pf_func(pfs = 10, bps = "nyse", bp_min_n = 5, min_stocks = 5, horizon = 12))

# Replication rates under progressively different methodological choices.
# rr[[1]] is based on terciles (hml_nonmc3), rr[[2]] on NYSE deciles (hml_nyse10).
rr <- list(hml_nonmc3, hml_nyse10) %>% lapply(function(dt) {
  # Sign returns by the factor's hypothesized direction and stack the three
  # weighting schemes (ew, vw, capped vw) into long format.
  repl_data <- char_info[dt, on = "characteristic"] %>%
    filter(eom_ret <= start) %>%
    mutate(
      ret_ew = ret_ew*direction,
      ret_vw = ret_vw*direction,
      ret_vw_cap = ret_vw_cap*direction
    ) %>%
    pivot_longer(c(ret_ew, ret_vw, ret_vw_cap), names_to = "type", values_to = "ret")
  
  # Average the lead-1..h returns per month, requiring all h leads present;
  # evaluated at horizons of 1, 6, and 12 months.
  repl_data <- c(1, 6, 12) %>% lapply(function(h) {
    repl_data %>%
      filter(lead %in% 1:h) %>%
      group_by(characteristic, eom_ret, type, significance) %>%
      filter(n() == h) %>%
      summarise(ret = mean(ret)) %>%
      ungroup() %>%
      mutate(horizon = h)
  }) %>% bind_rows()
  
  # Replication rate per weighting type: share of characteristic/horizon
  # factors with a t-statistic above 1.96.
  rr_func <- function(sample) {
    sample %>%
      group_by(characteristic, type, horizon) %>%
      summarise(
        t = mean(ret)/(sd(ret)/sqrt(n()))
      ) %>%
      group_by(type) %>%
      summarise(
        n = n(),
        rr = mean(t > 1.96)
      )
  }
  
  # Factors excluded in the "Difference in Factors" variant below.
  new_factors <- c(
    "ret_3_1", "ret_9_1",  "ret_12_7",  "corr_1260d",  "rmax5_21d", "rmax5_rvol_21d",
    "ni_be", "ocf_at", "ocf_at_chg1", "mispricing_perf", "mispricing_mgmt",  "qmj",
    "qmj_prof", "qmj_growth",  "qmj_safety")
  
  
  # Our Baseline Raw Return (one-month horizon only)
  baseline <- repl_data %>%
    filter(horizon == 1 & eom_ret <= start) %>%
    rr_func() %>%
    mutate(name = "Baseline")
  # Difference in horizons (all three horizons, full sample)
  hor_diff <- repl_data %>%
    rr_func() %>%
    mutate(name = "Three Horizons")
  # Difference in sample period (three horizons, 1967-2016 sample)
  sample_diff <- repl_data %>%
    filter(eom_ret >= as.Date("1967-01-01") & eom_ret <= as.Date("2016-12-31")) %>%
    rr_func() %>%
    mutate(name = "Three Horizons, Shorter Sample")
  # Differences in Factors (additionally drop the newer factors)
  fct_diff <- repl_data %>%
    filter(eom_ret >= as.Date("1967-01-01") & eom_ret <= as.Date("2016-12-31")) %>%
    filter(!(characteristic %in% new_factors)) %>%
    rr_func() %>%
    mutate(name = "Three Horizons, Shorter Sample, Difference in Factors")
  bind_rows(baseline, sample_diff, hor_diff, fct_diff) %>%
    arrange(type, name)
})

# Decomposition  
# Walk from our baseline replication rate (capped vw, terciles, non-micro-cap
# breakpoints) to the alternative design (vw, deciles, NYSE breakpoints),
# changing one methodological choice at a time.
terc_base <- rr[[1]] %>% filter(name == "Baseline" & type == "ret_vw_cap") %>% pull(rr)
terc_base_vw <- rr[[1]] %>% filter(name == "Baseline" & type == "ret_vw") %>% pull(rr)
terc_hor <- rr[[1]] %>% filter(name == "Three Horizons" & type == "ret_vw") %>% pull(rr)
terc_sample <- rr[[1]] %>% filter(name == "Three Horizons, Shorter Sample" & type == "ret_vw") %>% pull(rr)
terc_factors <- rr[[1]] %>% filter(name == "Three Horizons, Shorter Sample, Difference in Factors" & type == "ret_vw") %>% pull(rr)
dec_factors <- rr[[2]] %>% filter(name == "Three Horizons, Shorter Sample, Difference in Factors" & type == "ret_vw") %>% pull(rr)

# From vw_cap to vw
terc_base-terc_base_vw
# Multiple Horizons
terc_base_vw-terc_hor
# Shorter sample 
terc_hor-terc_sample
# New Factors
terc_sample-terc_factors
# Deciles instead of terciles and change of BP
terc_factors-dec_factors
# Explained Difference
# The steps telescope, so expl_rr reduces to dec_factors (the fully adjusted rate).
(expl_rr <- terc_base-((terc_base-terc_base_vw)+(terc_base_vw-terc_hor)+(terc_hor-terc_sample)+(terc_sample-terc_factors)+(terc_factors-dec_factors)))
expl_rr-0.35  # NOTE(review): 0.35 presumably an externally reported replication rate -- confirm against the paper


================================================
FILE: Analysis/main.R
================================================
library(cowplot)
library(directlabels)
library(xtable)
library(zeallot)
library(dendextend)
library(RColorBrewer)
library(rsample)
library(lubridate)
library(tidyverse)
library(data.table)
options(dplyr.summarise.inform = FALSE)

# How To --------------------------------------
# Paths
# - data_path:                Folder that contains market_returns.csv, hml.csv and cmp.csv generated from portfolio.R                
# - object_path:              Folder to save objects to. Retrieved when update_*==F
# - output_path:              Folder to save figures in. Not necessary if save_figures==F
# Save
# - save_figures:             Should figures be saved in output_path?
# Update
# - update_sim:               Simulations for figure 2 (Simulation Comparison of False Discovery Rates)
# - update_post_over_time:    Posterior calculations for figure 8 (US Factor Alpha Posterior Distribution over Time)
# - update_post_is:           Data for regression in table E.1 (The Economic Benefit of More Powerful Tests)
# - update_harvey_baseline:   Data for figure 9
# - update_harvey_worstcase:  Data for figure F.1

# User Input -----------------------
# Paths
data_path <- "Data"      # Folder with market_returns.csv, hml.csv and cmp.csv
object_path <- "Objects" # Folder where intermediate objects are saved/retrieved
output_path <- "Figures" # Folder where figures are written (if save_figures)
# Save
# Use TRUE/FALSE rather than T/F: T and F are ordinary variables that can be
# reassigned, whereas TRUE/FALSE are reserved words.
save_figures <- TRUE
# Update
update_sim <- TRUE              # Simulations for figure 2
update_post_over_time <- TRUE   # Posterior calculations for figure 8
update_post_is <- TRUE          # Data for regression in table E.1
update_harvey_baseline <- TRUE  # Data for figure 9
update_harvey_worstcase <- TRUE # Data for figure F.1
# Settings
settings <- list(
  seed = 1,
  start_date = as.Date("1925-12-31"),     
  end_date = as.Date("2022-12-31"),       # Important that end_date <= Last_CRSP_UPDATE
  country_excl = c("ZWE", "VEN"),         # Countries are excluded due to data issues
  weighting =  list(                      # Which weighting scheme to use? In c("ew", "vw", "vw_cap")
    us = 'vw_cap',
    global_ex_us = 'vw_cap'
  ),                   
  n_stocks_min = 5,                       # Minimum number of stocks on each side of the portfolio
Download .txt
gitextract_789kjfg8/

├── .gitignore
├── Analysis/
│   ├── .gitignore
│   ├── 0 - Functions.R
│   ├── 1 - Prepare Data.R
│   ├── 2 - Determine Clusters.R
│   ├── 3 - Analysis.R
│   ├── 4 - Output.R
│   ├── Analysis.Rproj
│   ├── Country Classification.xlsx
│   ├── Factor Details.xlsx
│   ├── README.md
│   ├── country_stats.R
│   ├── hxz_decomp.R
│   └── main.R
├── GlobalFactors/
│   ├── CHANGELOG.md
│   ├── Cluster Labels.csv
│   ├── Country Classification.xlsx
│   ├── Factor Details.xlsx
│   ├── GlobalFactors.Rproj
│   ├── MD
│   ├── README.md
│   ├── accounting_chars.sas
│   ├── char_macros.sas
│   ├── ind_identification.sas
│   ├── main.sas
│   ├── market_chars.sas
│   ├── portfolios.R
│   └── project_macros.sas
└── README.md
Condensed preview — 29 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (469K chars).
[
  {
    "path": ".gitignore",
    "chars": 235,
    "preview": "GlobalFactors/.Rproj.user\nGlobalFactors/.Rhistory\nGlobalFactors/.RData\nGlobalFactors/.Ruserdata\nAnalysis/Data/\nAnalysis/"
  },
  {
    "path": "Analysis/.gitignore",
    "chars": 64,
    "preview": ".Rhistory\n.RData\n.Rproj.user\nData/\nObjects/\nFigures/\nScribbles/\n"
  },
  {
    "path": "Analysis/0 - Functions.R",
    "chars": 126957,
    "preview": "eb_prepare <- function(data, scale_alphas, overlapping) {\n  if (overlapping) {\n    data <- data %>%\n      group_by(regio"
  },
  {
    "path": "Analysis/1 - Prepare Data.R",
    "chars": 6605,
    "preview": "# Prepare Support Data ---------------------------------\n# Market Returns\nmarket_returns <- fread(paste0(data_path, \"/ma"
  },
  {
    "path": "Analysis/2 - Determine Clusters.R",
    "chars": 3272,
    "preview": "# Hierachical Clustering ----------------------------------------------\nfactor_hcl <- function(cor_mat, linkage = \"ward."
  },
  {
    "path": "Analysis/3 - Analysis.R",
    "chars": 16218,
    "preview": "# Empirical Bayes Estimation --------------------------\n# search_list: c(regions, type, layers, size_grp)\nsearch_list <-"
  },
  {
    "path": "Analysis/4 - Output.R",
    "chars": 27257,
    "preview": "# Determine Cluster Order\ncluster_order <- c(\"Accruals\", \"Debt Issuance\", \"Investment\", \"Short-Term Reversal\", \"Value\",\n"
  },
  {
    "path": "Analysis/Analysis.Rproj",
    "chars": 205,
    "preview": "Version: 1.0\n\nRestoreWorkspace: Default\nSaveWorkspace: Default\nAlwaysSaveHistory: Default\n\nEnableCodeIndexing: Yes\nUseSp"
  },
  {
    "path": "Analysis/README.md",
    "chars": 2774,
    "preview": "## Overview\nThis repository contains code that is used for the paper [Is There a Replication Crisis in Finance?](https:/"
  },
  {
    "path": "Analysis/country_stats.R",
    "chars": 4723,
    "preview": "library(xtable)\nlibrary(tidyverse)\nlibrary(data.table)\n\n# Data [output from SAS code, already screened with obs_main=1, "
  },
  {
    "path": "Analysis/hxz_decomp.R",
    "chars": 8226,
    "preview": "library(lubridate)\nlibrary(tidyverse)\nlibrary(data.table)\noptions(dplyr.summarise.inform = FALSE)\n\n# User Input --------"
  },
  {
    "path": "Analysis/main.R",
    "chars": 4709,
    "preview": "library(cowplot)\nlibrary(directlabels)\nlibrary(xtable)\nlibrary(zeallot)\nlibrary(dendextend)\nlibrary(RColorBrewer)\nlibrar"
  },
  {
    "path": "GlobalFactors/CHANGELOG.md",
    "chars": 9482,
    "preview": "# CHANGELOG.md\nThis change log keeps track of changes to the underlying data set. In brackets, we highlight versions of "
  },
  {
    "path": "GlobalFactors/Cluster Labels.csv",
    "chars": 3207,
    "preview": "characteristic,cluster\nage,Low Leverage\naliq_at,Investment\naliq_mat,Low Leverage\nami_126d,Size\nat_be,Low Leverage\nat_gr1"
  },
  {
    "path": "GlobalFactors/GlobalFactors.Rproj",
    "chars": 205,
    "preview": "Version: 1.0\n\nRestoreWorkspace: Default\nSaveWorkspace: Default\nAlwaysSaveHistory: Default\n\nEnableCodeIndexing: Yes\nUseSp"
  },
  {
    "path": "GlobalFactors/MD",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "GlobalFactors/README.md",
    "chars": 7957,
    "preview": "## Overview\nThis repository contains code that create a dataset of global stock returns and characteristics. The dataset"
  },
  {
    "path": "GlobalFactors/accounting_chars.sas",
    "chars": 48176,
    "preview": "***************************************************************************\n*                     Characteritics to Extr"
  },
  {
    "path": "GlobalFactors/char_macros.sas",
    "chars": 25078,
    "preview": "/* MACROS USING COMPOSITE DATA */\n/* MACRO: MISPRICING_FACTORS \n- Based on the paper by Yuan and Stambaugh (2016)\n- Curr"
  },
  {
    "path": "GlobalFactors/ind_identification.sas",
    "chars": 17263,
    "preview": "* MACRO: FF_IND_CLASS\n\tAdd variable matching 4-digit SIC identifiers to Fama-French industry identifiers\n\t   Arguments:\n"
  },
  {
    "path": "GlobalFactors/main.sas",
    "chars": 12629,
    "preview": "/* Clean working environment */\nproc delete data = _all_ ; run ; \n\n*****************************************************"
  },
  {
    "path": "GlobalFactors/market_chars.sas",
    "chars": 30337,
    "preview": "* Market Chars: Monthly;\n%let monthly_chars=\n\t/* Market Based Size Variables */\n\tmarket_equity\n\t\n\t/* Total Dividend Paid"
  },
  {
    "path": "GlobalFactors/portfolios.R",
    "chars": 30668,
    "preview": "library(lubridate)\nlibrary(tidyverse)\nlibrary(data.table)\n\n# How To --------------------\n# Paths\n# - data_path:    Set t"
  },
  {
    "path": "GlobalFactors/project_macros.sas",
    "chars": 52014,
    "preview": "*************************************************************\n*  HELPER MACROS\n*****************************************"
  },
  {
    "path": "README.md",
    "chars": 2142,
    "preview": "## Overview\nThis repository contains the code used for the paper [Is There a Replication Crisis in Finance?](https://onl"
  }
]

// ... and 4 more files (download for full content)

About this extraction

This page contains the full source code of the bkelly-lab/ReplicationCrisis GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 29 files (430.1 KB), approximately 142.7k tokens. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.

Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.

Copied to clipboard!