Repository: bkelly-lab/ReplicationCrisis Branch: master Commit: 2434da1281aa Files: 29 Total size: 430.1 KB Directory structure: gitextract_789kjfg8/ ├── .gitignore ├── Analysis/ │ ├── .gitignore │ ├── 0 - Functions.R │ ├── 1 - Prepare Data.R │ ├── 2 - Determine Clusters.R │ ├── 3 - Analysis.R │ ├── 4 - Output.R │ ├── Analysis.Rproj │ ├── Country Classification.xlsx │ ├── Factor Details.xlsx │ ├── README.md │ ├── country_stats.R │ ├── hxz_decomp.R │ └── main.R ├── GlobalFactors/ │ ├── CHANGELOG.md │ ├── Cluster Labels.csv │ ├── Country Classification.xlsx │ ├── Factor Details.xlsx │ ├── GlobalFactors.Rproj │ ├── MD │ ├── README.md │ ├── accounting_chars.sas │ ├── char_macros.sas │ ├── ind_identification.sas │ ├── main.sas │ ├── market_chars.sas │ ├── portfolios.R │ └── project_macros.sas └── README.md ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ GlobalFactors/.Rproj.user GlobalFactors/.Rhistory GlobalFactors/.RData GlobalFactors/.Ruserdata Analysis/Data/ Analysis/Objects/ Analysis/Figures/ Analysis/.Rproj.user Analysis/.Rhistory Analysis/.RData Analysis/.Ruserdata .Rproj.user ================================================ FILE: Analysis/.gitignore ================================================ .Rhistory .RData .Rproj.user Data/ Objects/ Figures/ Scribbles/ ================================================ FILE: Analysis/0 - Functions.R ================================================ eb_prepare <- function(data, scale_alphas, overlapping) { if (overlapping) { data <- data %>% group_by(region, characteristic) %>% mutate(obs = n()) %>% ungroup() %>% filter(obs == max(obs)) %>% select(-obs) } # Adjust for Beta data <- data %>% group_by(region, characteristic) %>% mutate( beta = cov(ret, mkt_vw_exc)/var(mkt_vw_exc), ret_neu = (ret - mkt_vw_exc * beta)*100, scaling_fct = sqrt(10^2/12) / 
sd(ret_neu), ret_neu_scaled = ret_neu * scaling_fct ) %>% ungroup() # Make Wide data <- data %>% mutate(name_wide = str_c(characteristic, "__", region)) if(scale_alphas) { data_wide <- data %>% select(name_wide, eom, ret_neu_scaled) %>% spread(key = name_wide, value = ret_neu_scaled) } else { data_wide <- data %>% select(name_wide, eom, ret_neu) %>% spread(key = name_wide, value = ret_neu) } # Return list( "long" = data, "wide" = data_wide ) } block_cluster_func <- function(cor_mat, cl_lables) { cor_long <- cor_mat %>% as_tibble(rownames = "char1") %>% gather(-char1, key = "char2", value = "cor") %>% separate(col = "char1", into = c("char1", "region1"), sep = "__") %>% separate(col = "char2", into = c("char2", "region2"), sep = "__") %>% left_join(cl_lables %>% select(characteristic, "hcl1" = hcl_label), by = c("char1"="characteristic")) %>% left_join(cl_lables %>% select(characteristic, "hcl2" = hcl_label), by = c("char2"="characteristic")) %>% rowwise() %>% mutate( hclreg1 = str_c(hcl1, "__", region1), hclreg2 = str_c(hcl2, "__", region2) ) %>% select(-hcl1, -hcl2) %>% mutate(hcl_pair = str_c(min(c(hclreg1, hclreg2)), "_x_", max(c(hclreg1, hclreg2)))) %>% unite(col = "name1", char1, region1, sep = "__", remove = T) %>% unite(col = "name2", char2, region2, sep = "__", remove = T) %>% ungroup() cluster_wise_cor <- cor_long %>% filter(name1 != name2) %>% # Exclude cor(factor_i, factor_i)=1 group_by(hcl_pair) %>% summarise( cor_avg = mean(cor) ) cluster_block_cor <- cor_long %>% left_join(cluster_wise_cor, by = "hcl_pair") %>% mutate(cor_avg = if_else(name1 == name2, 1, cor_avg)) %>% # ONLY IF SAME REGION!! 
select(name1, name2, cor_avg) %>% spread(key = name2, value = cor_avg) cbc_rows <- cluster_block_cor$name1 cluster_block_cor <- cluster_block_cor %>% select(-name1) %>% as.matrix() rownames(cluster_block_cor) <- cbc_rows return(cluster_block_cor) } # Empirical Bayes ---------------- emp_bayes <- function(data, cluster_labels, min_obs = 5 * 12, fix_alpha = F, bs_cov = F, cor_type = "sample", shrinkage = 0, layers = 3, bs_samples = 10000, seed, priors = NULL, sigma = NULL, plot = T) { # cor_type %in% c("raw", "block_2", "block_clusters") set.seed(seed) y_raw <- data$wide %>% select(-eom) %>% as.matrix() obs <- y_raw %>% apply(2, function(x) sum(!is.na(x))) y <- y_raw[, obs >= min_obs] n_fcts <- ncol(y) y_mean <- y %>% apply(2, mean, na.rm = T) if (is.null(sigma)) { if (bs_cov) { bs_full <- y %>% rsample::bootstraps(times = bs_samples) %>% mutate( res = splits %>% map(~.x %>% rsample::analysis() %>% apply(2, mean, na.rm = T) %>% as_tibble(rownames = "characteristic")) ) %>% select(-splits) %>% unnest(res) bs_full_cov <- bs_full %>% spread(key = characteristic, value = value) %>% select(-id) %>% cov() alpha_sd <- sqrt(diag(bs_full_cov)) alpha_cor <- solve(diag(alpha_sd)) %*% bs_full_cov %*% solve(diag(alpha_sd)) colnames(alpha_cor) <- names(alpha_sd) rownames(alpha_cor) <- names(alpha_sd) } else { y_sd <- y %>% apply(2, sd, na.rm=T) y_scor <- y %>% cor(use = "complete.obs") alpha_sd <- y_sd / sqrt(nrow(y)) alpha_cor <- y_scor } # Apply Shrinkage alpha_cor_shrunk <- alpha_cor * (1-shrinkage) + diag(n_fcts) * shrinkage # Correlation Block Adjustment if (cor_type == "sample") { alpha_cor_adj <- alpha_cor_shrunk } if (cor_type == "block_clusters") { alpha_cor_adj <- alpha_cor_shrunk %>% block_cluster_func(cl_lables = cluster_labels) } sigma <- diag(alpha_sd) %*% alpha_cor_adj %*% diag(alpha_sd) # This is really the equivalent of sigma/T from the paper colnames(sigma) <- colnames(alpha_cor_shrunk) print(str_c("Condition Number: Raw = ", round(kappa(alpha_cor_shrunk), 2), ", 
Adjusted = ", round(kappa(alpha_cor_adj), 2))) } else { alpha_sd <- sqrt(diag(sigma)) names(alpha_sd) <- colnames(sigma) } # Cluster Membership cm <- y_mean %>% as_tibble(rownames = "char_reg") %>% mutate( characteristic = str_split(char_reg, "__", simplify = T)[, 1] ) %>% left_join(cluster_labels, by = "characteristic") m <- cm %>% mutate(cm = 1) %>% select(char_reg, hcl_label, cm) %>% spread(key = hcl_label, value = cm) %>% select(-char_reg) %>% as.matrix() m[is.na(m)] <- 0 mm <- m %*% t(m) n_cl <- ncol(m) # Signal Membership z <- cm %>% mutate(sm = 1) %>% select(char_reg, characteristic, sm) %>% spread(key = characteristic, value = sm) %>% select(-char_reg) %>% as.matrix() z[is.na(z)] <- 0 zz <- z %*% t(z) n_s <- ncol(z) # Starting Values starting_values <- cm %>% group_by(hcl_label, characteristic) %>% summarise( n_s = n(), signal_mean = mean(value), signal_sd = sd(value) ) %>% group_by(hcl_label) %>% summarise( n_c = sum(n_s), cl_mean = mean(signal_mean), cl_sd = sd(signal_mean), cl_signal_within = mean(signal_sd) ) %>% ungroup() %>% mutate(cl_sd = if_else(n_c == 1, 0, cl_sd)) %>% summarise( alpha_mean = mean(cl_mean), sd_cl_mean = if_else(condition = fix_alpha, sqrt(sum((cl_mean^2) / (n() - 1))), sd(cl_mean)), sd_within_cl = mean(cl_sd), sd_within_signal = mean(cl_signal_within) ) if (fix_alpha) { sd_all <- sqrt(sum(y_mean^2) / (length(y_mean) - 1)) } else { sd_all <- sd(y_mean) } # Maximum Likelihood omega_func <- function(layers, tau_c, tau_s, tau_w) { if (layers == 1) { a_omega <- diag(n_fcts) * tau_c^2 # All alphas are drawn from same distribution } if (layers == 2) { a_omega <- diag(n_fcts) * tau_s^2 + mm * tau_c^2 # All cluster alphas are drawn from the same distribution, could be done with signals as well } if (layers == 3) { a_omega <- diag(n_fcts) * tau_w^2 + zz * tau_s^2 + mm * tau_c^2 # Cluster distrib., signal distrib. factor distrib. 
} return(a_omega) } # Choose between specifying prior parameters or finding them via EB if (is.null(priors)) { if (layers == 1) { start_list <- list( a = starting_values$alpha_mean, tc = sd_all) mle_func <- function(a, tc) { a_vec <- rep(a, n_fcts) a_omega <- omega_func(layers = layers, tau_c = tc, tau_s = NULL, tau_w = NULL) a_cov <- sigma + a_omega # / t_mat -(mvtnorm::dmvnorm(x = y_mean, mean = a_vec, sigma = a_cov, log = T)) # + dgamma(param[2], 2, 5, log = T)*sum(mm)/2 + dgamma(param[3], 2, 10, log = T)*140 } } if (layers == 2) { start_list <- list( a = starting_values$alpha_mean, tc = starting_values$sd_cl_mean, ts = starting_values$sd_within_cl) mle_func <- function(a, tc, ts) { a_vec <- rep(a, n_fcts) a_omega <- omega_func(layers = layers, tau_c = tc, tau_s = ts, tau_w = NULL) a_cov <- sigma + a_omega # / t_mat -(mvtnorm::dmvnorm(x = y_mean, mean = a_vec, sigma = a_cov, log = T)) } } if (layers == 3) { start_list <- list( a = starting_values$alpha_mean, tc = starting_values$sd_cl_mean, ts = starting_values$sd_within_cl, tw = starting_values$sd_within_signal) mle_func <- function(a, tc, ts, tw) { a_vec <- rep(a, n_fcts) a_omega <- omega_func(layers = layers, tau_c = tc, tau_s = ts, tau_w = tw) a_cov <- sigma + a_omega -(mvtnorm::dmvnorm(x = y_mean, mean = a_vec, sigma = a_cov, log = T)) } } # Maximum likelihood estimation for (k in 1:10) { initial_params <- start_list %>% lapply(function(x) max(x+rnorm(1, mean = 0, sd = 0.01), 0)) # Max is just to ensure that variances are not negative, never in use if (fix_alpha) { (hyper_pars <- stats4::mle(minuslogl = mle_func, start = initial_params, lower = c(-Inf, 0, 0, 0)[1:length(start_list)], fixed = list(a = 0))) } else { (hyper_pars <- stats4::mle(minuslogl = mle_func, start = initial_params, lower = c(-Inf, 0, 0, 0)[1:length(start_list)])) } if (hyper_pars@details$convergence==0) break } # Check convergence if (hyper_pars@details$convergence != 0) { warning("MLE step did not converge!!!") return(NULL) } mu <- 
hyper_pars@fullcoef["a"] tau_c <- hyper_pars@fullcoef["tc"] tau_s <- hyper_pars@fullcoef["ts"] tau_w <- hyper_pars@fullcoef["tw"] } else { mu <- priors$alpha tau_c <- priors$tau_c tau_s <- priors$tau_s tau_w <- priors$tau_w } theta <- omega_func(layers = layers, tau_c = tau_c, tau_s = tau_s, tau_w = tau_w) colnames(theta) <- rownames(theta) <- names(y_mean) print(paste("Condition Number Omega =", round(kappa(theta)))) # Signal Posteriors ------------------ if (layers == 3) { as_mean <- tau_w^2*t(z) %*% (theta + sigma) %*% (y_mean - rep(mu, n_fcts)) as_cov <- tau_w^2 * diag(n_s) - tau_w^4 * t(z) %*% (theta + sigma) %*% z as_sd <- sqrt(diag(as_cov)) colnames(as_mean) <- "post_mean" signal_summary <- as_mean %>% as_tibble(rownames = "characteristic") %>% left_join(as_sd %>% as_tibble(rownames = "characteristic") %>% rename("post_sd" = value), by = "characteristic") } # Factor Posteriors ------------------ ai_cov <- solve(solve(theta) + solve(sigma)) # t_mat * solve(sigma) ai_sd <- sqrt(diag(ai_cov)) ai_mean <- ai_cov %*% (solve(theta) %*% rep(mu, n_fcts) + solve(sigma) %*% y_mean) ## (t_mat * solve(sigma)) rownames(ai_mean) <- names(y_mean) colnames(ai_mean) <- "post_mean" names(ai_sd) <- names(y_mean) factor_summary <- ai_mean %>% as_tibble(rownames = "char_reg") %>% left_join(ai_sd %>% as_tibble(rownames = "char_reg") %>% rename("post_sd" = value), by = "char_reg") %>% left_join(y_mean %>% as_tibble(rownames = "char_reg") %>% rename("ols_est" = value), by = "char_reg") %>% left_join(alpha_sd %>% as_tibble(rownames = "char_reg") %>% rename("ols_se" = value), by = "char_reg") %>% mutate( characteristic = str_split(char_reg, "__", simplify = T)[, 1], # characteristic = char_reg %>% str_extract(".+[?=_{2}]") %>% str_remove("__") p025 = post_mean - 1.96 * post_sd, p975 = post_mean + 1.96 * post_sd ) %>% left_join(cluster_labels, by = "characteristic") %>% mutate( region = char_reg %>% str_extract(pattern = "(?<=_{2}).+") ) %>% select(char_reg, characteristic, hcl_label, 
region, everything())
  # Output: compare the crude (moment-based) starting values with the ML estimates
  if (is.null(priors)) {
    comparison <- tibble(
      estimate = c("alpha", "tau_c", "tau_s", "tau_w")[1:(layers + 1)],
      crude = drop(unlist(start_list)),
      ml_est = c(mu, tau_c, tau_s, tau_w)[1:(layers + 1)]
    )
    if (fix_alpha) {
      ml_se <- c(NA_real_, sqrt(diag(solve(hyper_pars@details$hessian))))
    } else {
      ml_se <- sqrt(diag(solve(hyper_pars@details$hessian)))
    }
    comparison$ml_se <- ml_se
    print(comparison)
  }
  if (plot == T) {
    list("factors" = factor_summary) %>% eb_plots()
  }
  ret_list <- list(
    "input" = data,
    # "clusters" = cluster_summary,
    "factors" = factor_summary,
    "factor_mean" = ai_mean,
    "factor_cov" = ai_cov,
    "theta" = theta,
    "sigma" = sigma
  )
  # NOTE(review): `sigma` is re-assigned earlier in this function whenever it
  # enters as NULL, so this condition can never be TRUE here and the raw/adjusted
  # correlation matrices are never returned -- confirm whether a flag captured on
  # entry (e.g. `sigma_supplied <- is.null(sigma)`) was intended.
  if (is.null(sigma)) {
    ret_list[["alpha_cor_raw"]] <- alpha_cor_shrunk
    ret_list[["alpha_cor_adj"]] <- alpha_cor_adj
  }
  if (is.null(priors)) {
    ret_list[["mle"]] <- comparison
  }
  if (layers == 3) {
    ret_list$signal <- signal_summary
  }
  return(ret_list)
}

# False-discovery / family-wise rates as a function of the t-cutoff.
#
# For each candidate cutoff (the observed t-values above `t_low`, except the
# largest), factors with t >= cutoff are declared significant. `fdr` is the
# average share of posterior draws whose sign disagrees with the point
# estimate among the significant factors; `fwr` is the share of draws with at
# least one such sign disagreement.
#
# Args:
#   t_low: lower bound on candidate t-cutoffs.
#   a_vec: vector of (posterior) alpha estimates.
#   a_cov: covariance matrix of `a_vec` (diagonal gives squared std. errors).
#   n_sim: number of posterior draws.
#   seed:  RNG seed for reproducibility.
# Returns: a tibble with one row per cutoff (t_cutoff, n_sig, fdr, fwr).
fdr_sim <- function(t_low, a_vec, a_cov, n_sim = 10000, seed = 1) {
  set.seed(seed)
  t_all <- a_vec / sqrt(diag(a_cov))
  t_steps <- sort(t_all[t_all > t_low])
  t_steps <- head(t_steps, -1) # Don't include the last t-value (no significant)
  # Simulated alphas
  sims <- mvtnorm::rmvnorm(n = n_sim, mean = a_vec, sigma = a_cov)
  # False Discovery as a Function of t-cutoff
  t_steps %>% lapply(function(t) {
    # Significant alphas under t-cutoff
    sig <- (t_all >= t)
    # drop = FALSE keeps a one-column matrix when a single factor is
    # significant (plain `sims[, sig]` would collapse to a vector and break
    # rowMeans). sweep compares each COLUMN against its own point-estimate
    # sign; the original `sign(sims[, sig]) != sign(a_vec[sig])` recycled the
    # sign vector down the rows (column-major), which is only correct when all
    # significant point estimates share the same sign.
    sig_draws <- sims[, sig, drop = FALSE]
    wrong_sign <- sweep(sign(sig_draws), 2, sign(a_vec[sig]), FUN = "!=")
    sims_fdr <- rowMeans(wrong_sign)
    # Output
    tibble(t_cutoff = t, n_sig = sum(sig), fdr = mean(sims_fdr), fwr = mean(sims_fdr > 0))
  }) %>% bind_rows()
}

fdr_fwer_rates <- function(t_cutoff, a_vec, a_cov, orig_sig = F, n_sim = 10000, seed=1) {
  set.seed(seed)
  # Simulate from full posterior
  sims <- mvtnorm::rmvnorm(n = n_sim, mean = a_vec, sigma = a_cov)
  if (orig_sig == T) {
    orig_factors <- char_info %>% filter(significance == T) %>% pull(characteristic) %>% str_c("__world")
    sims <- sims[, match(x = orig_factors, table = rownames(a_vec))]
    a_vec <- a_vec[orig_factors, ]
    a_cov <-
a_cov[orig_factors, orig_factors] } t_all <- a_vec / sqrt(diag(a_cov)) sig <- (t_all >= t_cutoff) sig_sims <- sims[, sig] false_discoveries <- sig_sims %>% apply(1, function(x) mean(x<0)) # FDR Distribution fdr_dist <- tibble( min = min(false_discoveries), p025 = quantile(false_discoveries, 0.025), p50 = quantile(false_discoveries, 0.5), p975 = quantile(false_discoveries, 0.975), max = max(false_discoveries), mean = mean(false_discoveries), sd = sd(false_discoveries) ) fwer_dist <- tibble( min = min(false_discoveries!=0), p025 = quantile(false_discoveries!=0, 0.025), p50 = quantile(false_discoveries!=0, 0.5), p975 = quantile(false_discoveries!=0, 0.975), max = max(false_discoveries!=0), mean = mean(false_discoveries!=0), sd = sd(false_discoveries!=0) ) # FWER Distribution # Output print(paste("Factors:", length(t_all), "- Sig:", sum(sig))) print(paste("Mean p-value:", round(mean(1-pnorm(t_all[sig])), 6))) fwer_fdr <- tibble(t_cutoff = t_cutoff, n_sig = sum(sig), fdr = mean(false_discoveries), fwer = mean(false_discoveries > 0)) list("fdr_dist"=fdr_dist, "fwer_dist" = fwer_dist, "fwer_fdr"=fwer_fdr) } # True Factors true_factors <- function(t_cutoff, a_vec, a_cov, orig_sig = T, n_sim = 10000, seed=1) { set.seed(seed) post_vol <- sqrt(diag(a_cov)) # Simulate using all factors sims <- mvtnorm::rmvnorm(n = n_sim, mean = a_vec, sigma = a_cov) # Decide which factors to look at if (orig_sig == T) { orig_factors <- char_info %>% filter(significance == T) %>% pull(characteristic) %>% str_c("__world") orig_factors_match <- match(x = orig_factors, table = rownames(a_vec)) sims <- sims[, orig_factors_match] post_vol <- post_vol[orig_factors_match] a_vec <- a_vec[orig_factors_match] } sims <- 1:ncol(sims) %>% sapply(function(i) sims[, i] / post_vol[i]) true_factors_dist <- sims %>% apply(1, function(x) mean(x > t_cutoff)) true_stat <- mean(a_vec/post_vol > t_cutoff) # From point 4 in 
https://influentialpoints.com/Training/bootstrap_confidence_intervals-principles-properties-assumptions.htm bc_ci <- function(stat, bootstraps, alpha=0.05) { # bias corrected bootstrap standard errors # estimate bias in std. norm deviates b <- qnorm((sum(bootstraps > stat)+sum(bootstraps==stat)/2)/length(bootstraps)) # Proportion of bootstrap samples above the "population" estimate. If unbiased, b=0.5. To handle discrete statistics, half of the sample at the population estimate is assumed to lie above z <- qnorm(c(alpha/2,1-alpha/2)) # Std. norm. limits p <- pnorm(z-2*b) # bias-correct & convert to proportions quantile(bootstraps,p=p) # Bias-corrected percentile lims. } bias_corrected <- bc_ci(stat = true_stat, bootstraps = true_factors_dist, alpha = 0.05) tibble( min = min(true_factors_dist), p025 = quantile(true_factors_dist, 0.025), p50 = quantile(true_factors_dist, 0.5), p975 = quantile(true_factors_dist, 0.975), max = max(true_factors_dist), mean = mean(true_factors_dist), sd = sd(true_factors_dist), p025_bc = bias_corrected[1], p975_bc = bias_corrected[2] ) } # Simulations for figure 2 sim_mt_control <- function(sim_settings) { # Cluster membership m <- matrix(0, nrow = sim_settings$n, ncol = sim_settings$clusters) # Cluster membership j <- 0 for (i in 1:sim_settings$clusters) { m[(j+1):(j + sim_settings$fct_pr_cl), i] <- 1 j <- j + sim_settings$fct_pr_cl } # Correlation Matrix corr_mat <- m %*% t(m) corr_mat[corr_mat == 0] <- sim_settings$corr_across corr_mat[corr_mat == 1] <- sim_settings$corr_within diag(corr_mat) <- 1 # Sigma sigma <- sim_settings$se^2 * corr_mat # Predefine variables alpha_0_vec <- rep(sim_settings$alpha_0, sim_settings$n) i_n <- diag(sim_settings$n) # Simulation search_grid <- expand.grid("tau_c" = sim_settings$tau_c, "tau_w" = sim_settings$tau_w) 1:nrow(search_grid) %>% lapply(function(i) { tau_c <- search_grid[i, "tau_c"] tau_w <- search_grid[i, "tau_w"] print(paste("Iteration", i, "out of", nrow(search_grid))) alpha_noise <- 
MASS::mvrnorm(n = sim_settings$n_sims, mu = rep(0, sim_settings$n), Sigma = sigma) # Preallocate alpha noise for efficiency s <- 1:sim_settings$n_sims %>% lapply(function(s) { omega <- m %*% t(m) * tau_c^2 + i_n * tau_w^2 alpha_c <- rnorm(sim_settings$clusters) * tau_c alpha_w <- rnorm(sim_settings$n) * tau_w alpha_true <- alpha_0_vec + m %*% alpha_c + alpha_w alpha_hat <- alpha_true + alpha_noise[s, ] post_var <- solve(solve(omega) + solve(sigma)) post_alpha <- post_var %*% (solve(omega) %*% alpha_0_vec + solve(sigma) %*% alpha_hat) eb <- tibble("type" = "eb", "true_alpha" = drop(alpha_true), "z" = drop(post_alpha / sqrt(diag(post_var))), "p" = 2 * pnorm(abs(z), lower.tail = F)) ols <- tibble("type" = "ols", "true_alpha" = drop(alpha_true), "z" = drop(alpha_hat / sqrt(diag(sigma))), "p" = 2 * pnorm(abs(z), lower.tail = F)) by <- tibble("type" = "by", "true_alpha" = drop(alpha_true), "z" = ols$z) by$p <- p.adjust(ols$p, method = "BY") rbind(eb, ols, by) %>% mutate(sig = z > 0 & p < 0.025) %>% group_by(type) %>% summarise( sim = s, n_disc = sum(sig), true_disc = sum(sign(true_alpha[sig == T]) == sign(z[sig == T])), false_disc = n_disc - true_disc ) }) %>% bind_rows() s %>% group_by(type) %>% mutate(fdp = if_else(n_disc == 0, 0, false_disc / n_disc)) %>% summarise( fdr = mean(fdp), n_disc = mean(n_disc), false_disc = mean(false_disc), true_disc = mean(true_disc), tau_c = tau_c, tau_w = tau_w, n = n() ) %>% mutate(true_disc_rate = true_disc / (sim_settings$n / 2)) }) %>% bind_rows() } multiple_testing <- function(eb_all, eb_world = NULL) { eb_all$factors %>% bind_rows(eb_world$factors) %>% mutate( t_ols = ols_est/ols_se, p_ols = 2*pnorm(abs(t_ols), lower.tail = F) ) %>% group_by(region) %>% mutate( n = n(), p_bonf = p_ols %>% p.adjust(method = "bonferroni"), p_holm = p_ols %>% p.adjust(method = "holm"), p_bh = p_ols %>% p.adjust(method = "BH"), p_by = p_ols %>% p.adjust(method = "BY") ) %>% select(n, region, char_reg, "estimate" = ols_est, "statistic" = t_ols, "se" = 
ols_se, starts_with("p_")) %>%
    gather(starts_with("p_"), key = "method", value = "p") %>%
    mutate(
      method = method %>% str_remove("^p_"),
      mt_adj = case_when(
        method == "ols" ~ "None",
        method == "bh" ~ "FDR",
        method == "by" ~ "FDR",
        method == "bonf" ~ "FWR",
        method == "holm" ~ "FWR"
      ),
      method = case_when(
        method == "ols" ~ "OLS",
        method == "bh" ~ "BH",
        method == "by" ~ "BY",
        method == "bonf" ~ "Bonferroni",
        method == "holm" ~ "Holm",
        TRUE ~ method
      )
    )
}

# Bootstrap Tangency Portfolio --
# Bootstrap tangency-portfolio weights. Each factor column is scaled to unit
# volatility, then weights come from regressing a constant on the returns
# (so weights are proportional to the mean-variance solution) and are
# normalised to sum to one. With shorting = FALSE the regression is replaced
# by a non-negative least-squares fit via glmnet (lambda = 0, lower limit 0).
bootstrap_tpf <- function(data, n_boots = 100, shorting = T, seed = 1) {
  set.seed(seed)
  boot_func <- if (shorting) {
    function(splits, ...) {
      scaled <- analysis(splits) %>%
        apply(2, function(col) col / sd(col)) %>%
        as.data.frame()
      lm(rep(1, nrow(scaled)) ~ -1 + ., data = scaled) %>%
        broom::tidy() %>%
        mutate(weight = estimate / sum(estimate)) %>%
        mutate(term = term %>% str_remove_all("`")) %>%
        select(term, weight)
    }
  } else {
    function(splits, ...) {
      scaled <- analysis(splits) %>% apply(2, function(col) col / sd(col))
      glmnet::glmnet(y = rep(1, nrow(scaled)), x = scaled %>% as.matrix(),
                     lambda = 0, lower.limits = 0, intercept = F) %>%
        broom::tidy(return_zeros = T) %>%
        filter(term != "(Intercept)") %>%
        mutate(weight = estimate / sum(estimate)) %>%
        select(term, weight)
    }
  }
  # Apparent = T --> Generate original data (full sample kept as an extra draw)
  data %>%
    bootstraps(times = n_boots, apparent = T) %>%
    mutate(coef = splits %>% map(.f = boot_func))
}

# Full tpf
tpf_cluster <- function(data, mkt_region, orig_sig, min_date, n_boots, shorting, seed) {
  if (orig_sig) {
    orig_sig_values <- T
  } else {
    orig_sig_values <- c(T, F)
  }
  market_ret <- regional_mkt_ret[region == mkt_region]
  cluster_pf <- data %>%
    left_join(cluster_labels, by = "characteristic") %>%
    left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>%
    filter(orig_sig %in% orig_sig_values) %>%
    group_by(hcl_label, eom) %>%
    summarise(
      ret = mean(ret)
    )
  tpf_data <- cluster_pf %>%
    filter(eom >= min_date) %>%
    spread(key = hcl_label, value = ret)
%>% left_join(market_ret %>% select(eom, market), by = c("eom")) %>% rename(Market = market) tpf_data %>% select(-eom) %>% bootstrap_tpf(n_boots = n_boots, shorting = shorting, seed = seed) %>% mutate(market_region = mkt_region) } # In-Sample / Out-of-Sample Functions prepare_is_oos <- function(input, min_obs, orig_group, ret_scaled, type, print=F) { # ret_scaled in ('none', "all", "is") & type in ('is_oos', 'is_post', 'is_pre') data <- input %>% select(characteristic, eom, ret, mkt_vw_exc) %>% left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>% filter(orig_sig %in% orig_group) %>% left_join(char_info %>% select(characteristic, sample_start, sample_end), by = "characteristic") %>% mutate( period = case_when( year(eom) >= sample_start & year(eom) <= sample_end ~ "is", type == "pre" & year(eom) < sample_start ~ "oos", type == "post" & year(eom) > sample_end ~ "oos", type == "pre_post" & (year(eom) < sample_start | year(eom) > sample_end) ~ "oos" ), ret = ret * 100, mkt_vw_exc = mkt_vw_exc * 100 ) %>% filter(!is.na(period)) # Exclude data data_excl <- data %>% group_by(characteristic) %>% mutate(n_is = sum(period == "is"), n_oos = sum(period == "oos")) %>% filter(n_is >= min_obs & n_oos >= min_obs) if (ret_scaled == "none") { data_adj <- data_excl %>% mutate(ret_adj = ret) } if (ret_scaled == "all") { data_excl <- data_excl %>% group_by(characteristic, period) %>% mutate( ret_neu = ret - cov(ret, mkt_vw_exc)/var(mkt_vw_exc) * mkt_vw_exc, ret_adj = ret * (10/sqrt(12))/sd(ret_neu)) %>% select(-ret_neu) %>% ungroup() } if (ret_scaled == "is") { is_vol <- data_excl %>% filter(period == "is") %>% group_by(characteristic) %>% mutate(ret_neu = ret - cov(ret, mkt_vw_exc)/var(mkt_vw_exc) * mkt_vw_exc) %>% summarise( is_sd = sd(ret_neu) ) data_excl <- data_excl %>% left_join(is_vol, by = c("region", "characteristic")) %>% group_by(characteristic, period) %>% mutate(ret_adj = ret * (10/sqrt(12))/is_sd) %>% ungroup() %>% 
select(-is_sd)
  }
  full <- data %>% summarise(fct_all = uniqueN(characteristic))
  excl <- data_excl %>% summarise(fct_excl = uniqueN(characteristic))
  if (print) {
    print(tibble("type"=type, full, excl))
  }
  return(data_excl)
}

# Economic Benefit of more Power
# For originally-significant factors, at each estimation date on/after the end
# of the original sample, classify each factor as significant under the
# empirical-Bayes posterior (p025 > 0) and under BY-adjusted OLS p-values,
# then return the average return of the selected factors per region, selection
# rule ("type") and month. Also prints a diagnostic plot of rejection rates.
trading_on_significance <- function(posterior_is) {
  rejection_panel <- posterior_is %>%
    left_join(char_info %>% select(characteristic, significance, sample_end),
              by = "characteristic") %>%
    filter(significance == 1 & est_date >= sample_end) %>%
    group_by(est_date) %>%
    mutate(
      ols_p = pnorm(abs(ols_est / ols_se), lower.tail = F) * 2,
      by_p = p.adjust(ols_p, method = "BY")
    )
  # Diagnostic plot: rejection rates over time for EB, OLS and BY
  rejection_rates <- rejection_panel %>%
    summarise(
      rr_eb = mean(p025 > 0),
      rr_ols = mean(ols_p <= 0.05 & ols_est > 0),
      rr_by = mean(by_p <= 0.05 & ols_est > 0)
    ) %>%
    gather(rr_eb, rr_ols, rr_by, key = "type", value = "rr")
  print(rejection_rates %>% ggplot(aes(est_date, rr, colour = type)) + geom_point() + geom_line())
  # Positions are taken in the calendar year AFTER the estimation date
  positions <- rejection_panel %>%
    ungroup() %>%
    mutate(
      position_year = year(est_date) + 1,
      eb_sig = (p025 > 0),
      by_sig = (by_p <= 0.05 & ols_est > 0)
    ) %>%
    select(position_year, characteristic, eb_sig, by_sig)
  candidate_factors <- regional_pfs %>%
    mutate(year = year(eom)) %>%
    left_join(positions, by = c("characteristic", "year" = "position_year")) %>%
    mutate(marg_sig = (eb_sig == T & by_sig == F)) %>% # significant under EB but missed by BY
    gather(marg_sig, eb_sig, by_sig, key = "type", value = "significant")
  # Average return of the selected factors per region/rule/month
  candidate_factors %>%
    filter(significant == T) %>%
    group_by(region, type, significant, eom) %>%
    summarise(
      n = n(),
      ret = mean(ret),
      mkt = mean(mkt_vw_exc)
    )
}

# Simulation according to specification for Harvey et al (2016)
harvey_et_al_sim <- function(sim_settings, seed) {
  set.seed(seed)
  # Cluster membership
  m <- matrix(0, nrow = sim_settings$n, ncol = sim_settings$cl) # Cluster membership
  j <- 0
  for (i in 1:sim_settings$cl) {
    m[(j+1):(j + sim_settings$fct_pr_cl), i] <- 1
    j <- j + sim_settings$fct_pr_cl
  }
  mm <- m %*% t(m)
  # Correlation Matrix
  corr_mat <- mm
  corr_mat[corr_mat == 0] <- sim_settings$corr_across
  corr_mat[corr_mat ==
1] <- sim_settings$corr_within diag(corr_mat) <- 1 # Average Correlation (Should be close to zero) mean(corr_mat[lower.tri(corr_mat)]) # Sigma sigma <- sim_settings$se^2 * corr_mat # Predefine variables alpha_0_vec <- rep(sim_settings$alpha_0, sim_settings$n) i_n <- diag(sim_settings$n) # Simulation sim_settings$tau_ws %>% lapply(function(tau_w) { start <- proc.time() alpha_noise <- MASS::mvrnorm(n = sim_settings$n_sims, mu = rep(0, sim_settings$n), Sigma = sigma) # Preallocate alpha noise for efficiency tau_sim <- 1:sim_settings$n_sims %>% sapply(simplify = F, USE.NAMES = T, function(s) { print(paste("Tau_w:", tau_w, "- Simulation", s, "out of", sim_settings$n_sims)) # Simulate Alphas alpha_c <- c(rep(sim_settings$ret, times = sim_settings$cl_true), rep(0, times = (sim_settings$cl - sim_settings$cl_true))) alpha_w <- c(rnorm(sim_settings$n_true) * tau_w, rep(0, sim_settings$n - sim_settings$n_true)) alpha_true <- alpha_0_vec + m %*% alpha_c + alpha_w alpha_hat <- as.vector(alpha_true + alpha_noise[s, ]) # MLE Function mle_func <- function(a, tc, tw) { a_vec <- rep(a, sim_settings$n) a_omega <- i_n * tw^2 + mm * tc^2 a_cov <- sigma + a_omega # / t_mat -(mvtnorm::dmvnorm(x = alpha_hat, mean = a_vec, sigma = a_cov, log = T)) } # Starting Values starting_values <- tibble(a = alpha_hat, cl = rep(1:sim_settings$cl, each = sim_settings$fct_pr_cl)) %>% group_by(cl) %>% summarise( cl_mean = mean(a), cl_sd = sd(a) ) %>% summarize( crude_a0 = mean(cl_mean), crude_tc = if_else(sim_settings$fix_alpha, sqrt(sum((cl_mean^2) / (n() - 1))), sd(cl_mean)), crude_tw = mean(cl_sd) ) start_list <- list( a = starting_values$crude_a0, tc = starting_values$crude_tc, tw = starting_values$crude_tw) # Estimate Parameters if (sim_settings$fix_alpha) { (hyper_pars <- stats4::mle(minuslogl = mle_func, start = start_list, lower = c(-Inf, 0, 0), fixed = list(a = 0))) } else { (hyper_pars <- stats4::mle(minuslogl = mle_func, start = start_list, lower = c(-Inf, 0, 0))) } # Check convergence if 
(hyper_pars@details$convergence != 0) {
        warning("MLE step did not converge!!!")
        return(NULL)
      }
      mu <- hyper_pars@fullcoef["a"]
      tc <- hyper_pars@fullcoef["tc"]
      tw <- hyper_pars@fullcoef["tw"]
      mle <- tibble(
        s = rep(s, 3),
        coef = c("a", "tc", "tw"),
        mle = c(mu, tc, tw),
        crude = c(start_list$a, start_list$tc, start_list$tw)
      )
      print(mle)
      # Specify Posterior
      omega <- i_n * tw^2 + mm * tc^2
      post_cov <- solve(solve(omega) + solve(sigma))
      post_alpha <- post_cov %*% (solve(omega) %*% alpha_0_vec + solve(sigma) %*% alpha_hat)
      list("alpha_true" = alpha_true, "alpha_hat" = alpha_hat, "post_alpha" = post_alpha,
           "post_cov" = post_cov, "mle" = mle)
    })
    print(proc.time() - start) # 5 iterations took 232.13/60 = 4 minutes
    return(tau_sim)
  })
}

# Single Factor TP --
# Sharpe ratio of portfolio weights `w` over the return columns of `data`
# (mean over volatility at the data's native frequency, not annualised).
sr_func <- function(data, w) {
  mu <- colMeans(data)
  v <- cov(data)
  drop(w %*% mu / sqrt(t(w) %*% v %*% w))
}

# Enhanced-portfolio-optimization tangency weights: shrink the correlation
# matrix towards the identity by `s` (s = 0 is plain mean-variance, s = 1
# ignores correlations entirely), rebuild the covariance from the per-column
# volatilities, and normalise the mean-variance weights to sum to one.
epo_tpf <- function(data, s) {
  vols <- apply(data, 2, sd)
  corr <- cor(data)
  mu <- colMeans(data)
  corr_shrunk <- diag(length(vols)) * s + corr * (1 - s)
  cov_shrunk <- diag(vols) %*% corr_shrunk %*% diag(vols)
  raw_w <- solve(cov_shrunk) %*% mu
  drop(raw_w / sum(raw_w))
}

# Prepare data for
prepare_tpf_factors <- function(region, orig_sig_values, start, scale) {
  mkt <- regional_mkt_ret %>% filter(region == !!region) %>% select(-region)
  tpf_factors <- eb_est[[region]]$input$long %>%
    left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>%
    filter(orig_sig %in% orig_sig_values) %>%
    group_by(eom) %>%
    mutate(n = n()) %>%
    ungroup() %>%
    filter(eom >= start) %>%
    select(characteristic, eom, n, ret)
  # Ensure that all factors have data
  missing <- tpf_factors %>% filter(n != max(n)) %>% select(eom, n) %>% distinct()
  if (nrow(missing) > 0) {
    warning("UNBALANCED PANEL - SOME FACTORS ARE MISSING DATA!")
  }
  tpf_factors <- tpf_factors %>% filter(n == max(n)) %>% select(-n)
  # 1.
Overall TPF tpf_factors <- tpf_factors %>% bind_rows(mkt %>% rename("ret"="market") %>% mutate(characteristic = "market") %>% filter(eom %in% tpf_factors$eom)) if (scale) { tpf_factors <- tpf_factors %>% group_by(characteristic) %>% mutate(ret = ret * (0.1 / sqrt(12)) / sd(ret)) } tpf_factors_wide <- tpf_factors %>% pivot_wider(names_from = characteristic, values_from = ret) %>% select(-eom) # Output list("long"=tpf_factors, "wide" = tpf_factors_wide) } # Optimal Shrinkage optimal_shrinkage <- function(data, k, epo_range = seq(0, 1, 0.1)) { finance_kfold <- function(dates, k, horizon) { # Helper Function helper_eom_seq <- function(ends, horizon) { all <- ends %>% lapply(function(d) seq.Date(from = ceiling_date(d, unit = "month") - months(horizon - 1), to = ceiling_date(d, unit = "month"), by = "1 month") - 1) all_unique <- do.call(c, all) %>% unique() } # Split Dates into k groups date_vec <- dates %>% unique() %>% sort() n <- length(date_vec) n_fold <- floor(length(date_vec) / k) split <- split(date_vec, cut(seq_along(date_vec), k, labels = FALSE)) # Create train/test split 1:k %>% lapply(function(i) { test_ends <- split[[i]] test_dates <- test_ends %>% helper_eom_seq(horizon = horizon) train_ends <- do.call(c, split[-i]) train_dates <- train_ends %>% helper_eom_seq(horizon = horizon) # Avoid Data Leakage train_dates <- train_dates[!(train_dates %in% test_dates)] tibble(fold = i, train = list(train_dates), test = list(test_dates)) }) %>% bind_rows() } date_split <- unique(data$eom) %>% finance_kfold(k = k, horizon = 1) cross_val <- 1:k %>% lapply(function(i) { test_dates <- date_split$test[[i]] test <- data %>% filter(eom %in% test_dates) %>% pivot_wider(names_from = characteristic, values_from = ret) test_eom <- test$eom test <- test %>% select(-eom) train_dates <- date_split$train[[i]] train <- data %>% filter(eom %in% train_dates) %>% pivot_wider(names_from = characteristic, values_from = ret) %>% select(-eom) # Create Weights nonneg_w <- glmnet::glmnet(y = 
rep(1, nrow(train)), x = train %>% as.matrix(), lambda = 0, lower.limits = 0, intercept = F) %>% tidy(return_zeros = T) %>% filter(term != "(Intercept)") %>% mutate( type = "Non-Negative", weight = estimate / sum(estimate) ) %>% select(type, weight) epo_w <- epo_range %>% lapply(function(s) { epo <- train %>% epo_tpf(s = s) tibble(type = paste0("EPO s=", s), weight = epo) }) # OOS Performance c(list(nonneg_w), epo_w) %>% lapply(function(x) { w <- x$weight tibble( type = unique(x$type), eom = test_eom, ret = drop(as.matrix(test) %*% w) ) }) %>% bind_rows() %>% mutate( i = i, test_range = paste0(year(min(test_dates)), "-", year(max(test_dates))) ) }) %>% bind_rows() cross_val_summary <- cross_val %>% group_by(type) %>% summarise( ann_ret = mean(ret), sd = sd(ret), sr = ann_ret/sd ) %>% mutate( type_overall = if_else(str_detect(type, "EPO"), "EPO", "Non-Negative"), type = if_else(type == "EPO s=0", "Unconstrained", type), type = type %>% factor(levels = c("Non-Negative", "Unconstrained", paste0("EPO s=", seq(0.1, 1, 0.1)))) ) print(cross_val_summary %>% ggplot(aes(type, sr, group=type_overall)) + geom_point() + geom_path() + labs(colour = "Test Period:", y = "Monthly OOS SR of TPF") + theme(legend.position = "top", axis.title.x = element_blank(), axis.text.x = element_text(angle = 45, vjust=0.5))) opt_s_summary <- cross_val_summary %>% filter(type_overall == "EPO" & sr == max(sr)) %>% mutate(s = type %>% str_remove("EPO s=") %>% as.numeric()) print(paste0("Highest OOS SR: ", opt_s_summary %>% pull(sr) %>% round(2), ", Standard MVO: ", cross_val_summary %>% filter(type == "Unconstrained") %>% pull(sr) %>% round(2))) # Optimal shrinkage opt_s_summary %>% pull(s) } # Table Functions -------------------------------------------------------- table_is_oos_ols <- function(is_oos_regs, is_post_regs) { oos_us <- lm(oos ~ is, data = is_oos_regs %>% filter(region == "us")) post_us <- lm(post ~ is, data = is_post_regs %>% filter(region == "us")) stargazer::stargazer( post_us, 
    oos_us,
    title = "OLS - Biased: $\\hat{\\alpha}_\\text{Out-of-Sample} = \\gamma_0 + \\gamma_1\\times\\hat{\\alpha}_\\text{In-Sample}$",
    out.header=T, no.space=T, digits=3, type='latex', single.row=F, align = T, notes.align = "l",
    omit.stat = c("adj.rsq", "f", "ser"),
    covariate.labels = c("Intercept", "$\\hat{\\alpha}_\\text{IS}$"),
    dep.var.caption = "Dependent Variable:",
    dep.var.labels = c("$\\hat{\\alpha}_\\text{Post IS}$", "$\\hat{\\alpha}_\\text{Pre \\& Post IS}$"),
    notes.append=T, report = "vc*t", intercept.bottom = F)
}

# table_is_oos_nls: stargazer table for the two NLS fits (post-sample and
# OOS). stargazer cannot render nls objects directly, so a throwaway lm with
# matching term names is built and its coefficients/SEs/t/p are overridden
# with the NLS results via stargazer's coef=/se=/t=/p= arguments.
table_is_oos_nls <- function(nls_post, nls_oos) {
  info <- list(
    "post" = list(
      "dep" = "post",
      "fit" = nls_post
    ),
    "oos" = list(
      "dep" = "oos",
      "fit" = nls_oos
    )
  )
  op <- info %>% sapply(simplify = F, USE.NAMES = T, function(x) {
    # Create Fake Linear Model (placeholder for stargazer layout only)
    fake_data <- tibble(y = rnorm(10), k0 = rnorm(10), kh = rnorm(10))
    colnames(fake_data) <- c(x$dep, "k0", "kh")
    lm_string <- paste0(x$dep, "~k0 + kh -1")
    fake_lm = lm(lm_string, data = fake_data)
    fake_x = c("k0", "kh")
    # Generate various parts for output from the real NLS fit
    sum_xx = summary(x$fit$nls_fit)
    mat_xx = sum_xx$coefficients[1:2, ]
    colnames(mat_xx) = c("coef","se", "t", "p")
    indVarNames = rownames(mat_xx)
    # Generate coefficients, se, t-stat and p values
    df_xx = as.data.frame(mat_xx)
    vCoef = df_xx$coef; names(vCoef)=fake_x
    vSE = df_xx$se; names(vSE)=fake_x
    vT = df_xx$t; names(vT)=fake_x
    vP = df_xx$p; names(vP)=fake_x
    # NOTE(review): formulaTxt/nParameters/nDF/convTolerance are computed but
    # not returned — kept for fidelity.
    formulaTxt = sum_xx$formula
    nParameters = sum_xx$df[1]
    nDF = sum_xx$df[2]
    obs <- length(x$fit$nls_fit$m$resid())
    n_fcts <- uniqueN(x$fit$nls_data$c)
    rss = round(sum_xx$sigma, 3)
    convTolerance = x$fit$nls_fit$m$conv()
    list("lm"=fake_lm, "coef" = vCoef, "se" = vSE, "t" = vT, "p" = vP, "rss" = rss, "obs" = obs, "n_fcts" = n_fcts)
  })
  # Determine order of the two columns in the table
  y1 <- "post"
  y2 <- "oos"
  # Aesthetics
  vTitle = "NLS - Unbiased: $R_{i,t} = \\alpha_i + (\\kappa_0 + \\kappa_h \\times \\alpha_i)\\times 1_\\text{\\{Out-of-Sample\\}}$"
  vType = "latex"
  # v_col_label = c("USA", "Developed", "Emerging")
  lines_obs <- c("Observations", sprintf("\\multicolumn{1}{r}{%s}", prettyNum(op[[y1]]$obs, big.mark = ",")), sprintf("\\multicolumn{1}{r}{%s}", prettyNum(op[[y2]]$obs, big.mark = ",")))
  lines_fcts <- c("Factors", sprintf("\\multicolumn{1}{r}{%s}", op[[y1]]$n_fcts), sprintf("\\multicolumn{1}{r}{%s}", op[[y2]]$n_fcts))
  dep_var_options <- list("post" = "Post IS", "oos" = "Pre \\& Post IS")
  dep_var_lbls <- c(dep_var_options[[y1]], dep_var_options[[y2]])
  # Output: fake lms carry the layout; real NLS stats injected below
  stargazer::stargazer(
    op[[y1]]$lm, op[[y2]]$lm,
    title = vTitle, out.header=T, no.space=T, digits=3, type=vType, single.row=F, align = T, notes.align = "l",
    omit.stat = c("rsq","adj.rsq", "f", "n", "ser"),
    covariate.labels = c("$\\kappa_0$", "$\\kappa_h$"),
    dep.var.labels.include = T,
    dep.var.caption = "OOS Period:",
    dep.var.labels = dep_var_lbls,
    add.lines=list(lines_fcts, lines_obs),
    report = "vc*t", intercept.bottom = F,
    # notes=vNotes,
    notes.append=T,
    coef=list(op[[y1]]$coef, op[[y2]]$coef),
    se=list(op[[y1]]$se, op[[y2]]$se),
    t=list(op[[y1]]$t, op[[y2]]$t),
    p=list(op[[y1]]$p, op[[y2]]$p)
  )
}

# table_taus: LaTeX table of the maximum-likelihood tau parameters for each
# sample/region. Reads from the global `eb_est` list — defined elsewhere.
table_taus <- function(){
  taus <- list(
    list("USA", "us"),
    list("Developed", "developed"),
    list("Emerging", "emerging"),
    list("USA, Developed & Emerging", "all"),
    list("World", "world"),
    list("World ex. US", "world_ex_us"),
    list("USA - Mega", "us_mega"),
    list("USA - Large", "us_large"),
    list("USA - Small", "us_small"),
    list("USA - Micro", "us_micro"),
    list("USA - Nano", "us_nano")
  ) %>% lapply(function(x) {
    eb_est[[x[[2]]]]$mle %>%
      select(estimate, ml_est) %>%
      spread(key = estimate, value = ml_est) %>%
      mutate(sample = x[[1]])
  }) %>% bind_rows() %>% select(sample, tau_c, tau_s, tau_w)
  tau_cap <- paste(
    "The table shows the tau parameters estimated by maximum likelihood.",
    "$\\tau_c$ is the estimated dispersion in cluster alphas.",
    "$\\tau_w$ is the estimated dispersion in factor alphas with a cluster.",
    "$\\tau_s$ is the estimated dispersion in alpha of the same factor in different regions."
  )
  # NOTE(review): the tau_w/tau_s column swap below is intentional per the
  # original inline comment — do not "fix" it.
  taus %>%
    select("Sample" = sample, "$\\tau_c$" = tau_c, "$\\tau_w$" = tau_s, "$\\tau_s$" = tau_w) %>% # Here I use the notation from eq 23
    xtable(auto=T, digits = 2, caption = tau_cap) %>%
    print(include.rownames = F, caption.placement = "top", sanitize.colnames.function = identity)
}

# Table - Factor Performance
# table_factor_info: wide table of OLS estimate, EB posterior mean, and
# P(alpha <= 0) per factor and region, with empty spacer columns between
# regions; factors never significant in the literature are starred.
# Reads globals `eb_est` and `char_info`.
table_factor_info <- function() {
  table <- eb_est$all$factors %>%
    mutate(p_zero = pnorm(q = 0, mean = post_mean, sd = post_sd)) %>%
    select(characteristic, region, ols_est, "eb_est" = post_mean, p_zero) %>%
    pivot_wider(names_from = region, values_from = c(ols_est, eb_est, p_zero), names_sep = "_") %>%
    select(characteristic, ends_with("_us"), ends_with("_developed"), ends_with("_emerging")) %>%
    left_join(char_info %>% select(characteristic, significance), by = "characteristic") %>%
    mutate(char_name = if_else(significance == 0, paste0(characteristic, "*"), characteristic)) %>%
    select(-characteristic, -significance) %>%
    select(char_name, everything()) %>%
    arrange(ols_est_us) %>%
    as.data.frame()
  # Insert empty spacer columns between the three region groups
  data.frame(table[, 1:4], "empty1" = rep("", nrow(table)), table[, 5:7], "empty2" = rep("", nrow(table)), table[, 8:10]) %>%
    xtable() %>%
    print()
}

# table_economic_benefit: CAPM regressions (Newey-West SEs, 6 lags) of the
# marginally-significant portfolios per region; stargazer LaTeX output.
table_economic_benefit <- function(sig_pfs) {
  sig_regs <- c("us", "developed", "emerging") %>% lapply(function(x) {
    fit <- lm(ret ~ mkt, data = sig_pfs %>% filter(type == "marg_sig" & region == x) %>% mutate(ret = ret*100, mkt = mkt*100)) # Ensures alpha is in Percentages
    nw <- fit %>% lmtest::coeftest(vcov = sandwich::NeweyWest(fit, lag = 6))
    list("fit"=fit, "nw"=nw)
  })
  lines_obs <- c("Observations",
                 sprintf("\\multicolumn{1}{r}{%s}", prettyNum(length(sig_regs[[1]]$fit$residuals), big.mark = ",")),
                 sprintf("\\multicolumn{1}{r}{%s}", prettyNum(length(sig_regs[[2]]$fit$residuals), big.mark = ",")),
                 sprintf("\\multicolumn{1}{r}{%s}", prettyNum(length(sig_regs[[3]]$fit$residuals), big.mark = ",")))
  lines_r2 <- c("Adjusted $R^2$",
                sprintf("\\multicolumn{1}{r}{%s}", formatC(summary(sig_regs[[1]]$fit)$adj.r.squared, digits = 2, format = "f")),
                sprintf("\\multicolumn{1}{r}{%s}", formatC(summary(sig_regs[[2]]$fit)$adj.r.squared, digits = 2, format = "f")),
                sprintf("\\multicolumn{1}{r}{%s}", formatC(summary(sig_regs[[3]]$fit)$adj.r.squared, digits = 2, format = "f")))
  stargazer::stargazer(sig_regs[[1]]$nw, sig_regs[[2]]$nw, sig_regs[[3]]$nw,
                       dep.var.labels.include = F,
                       dep.var.caption = "Region",
                       no.space = F,
                       intercept.bottom = F,
                       report = "vc*t",
                       column.labels = c("US", "Developed ex. US", "Emerging"),
                       add.lines=list(lines_obs, lines_r2),
                       covariate.labels = c("Alpha", "Market Beta"),
                       align=T, digits=2)
}

# PLOT FUNCTIONS ---------------------------------------------------------

# cluster_val: average pairwise correlation between clusters, returned as a
# zero-argument plotting closure (corrplot draws to the active device, so the
# plot must be re-run inside the output device). `colours_theme` is a global.
cluster_val <- function(cor, labels, op_format = "pdf") {
  pairwise_cor <- cor %>%
    as_tibble(rownames = "char1") %>%
    gather(-char1, key = "char2", value = "cor") %>%
    left_join(select(labels, characteristic, "label1" = hcl_label), by = c("char1"="characteristic")) %>%
    left_join(select(labels, characteristic, "label2" = hcl_label), by = c("char2"="characteristic")) %>%
    filter(char1 != char2) %>%
    mutate(hcl_pair = str_c(label1, "_", label2)) %>%
    group_by(hcl_pair) %>%
    summarise(
      n = n(),
      cor_avg = mean(cor)
    ) %>%
    ungroup() %>%
    separate(hcl_pair, c("hcl1", "hcl2"), sep = "_") %>%
    select(hcl1, hcl2, cor_avg) %>%
    spread(key = hcl2, cor_avg)
  pairwise_cor_names <- pairwise_cor$hcl1
  pairwise_cor <- pairwise_cor %>% select(-hcl1) %>% as.matrix()
  rownames(pairwise_cor) <- pairwise_cor_names
  # Needs to Be saved as a functional
  if (op_format == "tex") {
    corrplot_cex <- list(tl = 0.8, number = 0.5)
  }
  if (op_format == "pdf") {
    corrplot_cex <- list(tl = 0.7, number = 0.45)
  }
  function() {
    par(xpd=TRUE)
    pairwise_cor %>% corrplot::corrplot(method = "color", addCoef.col = "black", type = "lower", mar = c(0, 0, 3, 0), tl.cex = corrplot_cex$tl, number.cex = corrplot_cex$number, tl.col = "black", col = colorRampPalette(c(colours_theme[2], "white", colours_theme[1]))(200))
  }
}

# plot_mt_eb_comp: compares replication rates under multiple-testing
# adjustments vs. empirical-Bayes posteriors, by region. Returns a list with
# a per-factor alpha plot ("mt") and a replication-rate bar plot ("repl").
plot_mt_eb_comp <- function(mt, eb_all, eb_us = NULL,
eb_developed = NULL, eb_emerging = NULL, eb_world = NULL,
                            mts = c("OLS", "Bonferroni", "BY"),
                            regs = c("us", "developed", "emerging", "world"),
                            se_methods, se_regions) {
  mt_sub <- mt %>% mutate(method = method %>% factor(levels = c("OLS", "Bonferroni", "Holm", "BH", "BY", "EB - Region", "EB - Full"))) %>% filter(method %in% mts & region %in% regs)
  # Implied t-cutoff per method/region: midpoint between the smallest
  # significant and largest insignificant |t| (outer parens auto-print).
  (t_cutoff <- mt_sub %>% group_by(method, region) %>% summarise(
    t_cut = (min(abs(statistic)[p < 0.05]) + max(abs(statistic)[p > 0.05])) / 2
  ))
  # Stack the empirical-Bayes results: "EB - All" is the joint estimation,
  # "EB - Region" the region-by-region estimations.
  eb_comb <- bind_rows(
    eb_all$factors %>% mutate(method = "EB - All"),
    eb_us$factors %>% mutate(method = "EB - Region"),
    eb_developed$factors %>% mutate(method = "EB - Region"),
    eb_emerging$factors %>% mutate(method = "EB - Region"),
    eb_world$factors %>% mutate(method = "EB - Region")
  ) %>% mutate(
    method = method %>% factor(levels = c("OLS", "Bonferroni", "Holm", "BH", "BY", "EB - Region", "EB - All"))
  )
  # Replication rate per region/method, frequentist side
  mt_table <- mt_sub %>%
    mutate(characteristic = char_reg %>% str_remove("__.+")) %>%
    left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>%
    filter(orig_sig == T) %>%
    group_by(region, method, mt_adj) %>%
    summarise(
      n = n(),
      significant = mean(p < 0.05 & estimate > 0), # Estimates also needs to be positive
      max_t_insig = max(abs(statistic[p > 0.05]))
    )
  # Replication rate per region/method, Bayesian side (2.5% posterior
  # quantile above zero counts as replicated)
  eb_table <- eb_comb %>%
    left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>%
    filter(orig_sig == T) %>%
    mutate(ols_t = ols_est / ols_se) %>%
    group_by(region, method) %>%
    summarise(
      mt_adj = "Bayesian",
      n = n(),
      significant = mean(p025 > 0),
      max_t_insig = max(abs(ols_t[p025 < 0 & p975 > 0]))
    )
  (comp_table <- mt_table %>% bind_rows(eb_table))
  # To install drlib put: devtools::install_github("dgrtwo/drlib")
  # Old drlib-based version of the bar plot, kept disabled for reference.
  if (FALSE) {
    repl_plot <- comp_table %>%
      mutate(
        region_pretty = case_when(
          region == "us" ~ "US",
          region == "developed" ~ "Developed Ex. US",
          region == "emerging" ~ "Emerging",
          region == "world" ~ "World"
        ),
        region_pretty = region_pretty %>% factor(levels = c("US", "Developed Ex. US", "Emerging", "World"))
      ) %>%
      filter((region == "world" & method == "EB - All") | region != "world") %>%
      filter(method != "Bonferroni") %>%
      # filter(!(region == "world" & method != "Empirical Bayes")) %>%
      ggplot(aes(drlib::reorder_within(method, significant, region_pretty), significant, fill = method)) +
      geom_col() +
      drlib::scale_x_reordered() +
      geom_text(aes(label = str_c(formatC(round(significant * 100, 2), digits = 2, format = "f"), "%")), nudge_y = 0.025, size = 3) +
      facet_wrap(~region_pretty, nrow = 1, scales = "free_x") +
      labs(x = "Method", fill = "Multiple Testing Adj.", y = "Replication Rate (%)") +
      theme(legend.position = "none")
  }
  # Replication-rate bar chart by region and method
  repl_plot <- comp_table %>%
    # filter((region == "world" & method == "EB - All") | region != "world") %>%
    filter(method != "Bonferroni") %>%
    group_by(method) %>%
    # mutate(sort_var = significant[region == "us"]) %>%
    mutate(
      region_pretty = case_when(
        region == "us" ~ "US",
        region == "developed" ~ "Developed Ex. US",
        region == "emerging" ~ "Emerging",
        region == "world" ~ "World"
      ),
      region_pretty = region_pretty %>% factor(levels = c("US", "Developed Ex. US", "Emerging", "World")),
      method_pretty = case_when(
        method == "BY" ~ "Benjamini-Yekutieli",
        method == "EB - Region" ~ "Empirical Bayes - Region",
        method == "EB - All" ~ "Empirical Bayes - All",
        method == "OLS" ~ "OLS"
      ),
      method_pretty = method_pretty %>% factor(levels = c("OLS", "Benjamini-Yekutieli", "Empirical Bayes - Region", "Empirical Bayes - All"))
    ) %>%
    ggplot(aes(method_pretty, significant*100, fill = method_pretty)) + #reorder(method_pretty, sort_var)
    geom_col() +
    geom_text(aes(label = str_c(formatC(round(significant * 100, 1), digits = 1, format = "f"), "%")), nudge_y = 2.5, size = 3) +
    facet_grid(. ~ region_pretty, scales = "free", space='free') +
    # facet_wrap(~region_pretty, nrow = 1, scales = "free_x") +
    labs(x = "Method", fill = "Multiple Testing Adj.", y = "Replication Rate (%)") +
    theme(legend.position = "none", axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1, size = 10), axis.title.x = element_blank())
  # EB results reshaped to the same columns as mt_sub so they can be stacked
  eb_overview <- eb_comb %>%
    left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>%
    mutate(
      t_cut = 1.96,
      type = case_when(
        p025 > 0 & orig_sig == 1 ~ "Replicated",
        p025 <= 0 & orig_sig == 1 ~ "Not Replicated",
        orig_sig == 0 ~ "Never Significant"
      )
    ) %>%
    select(region, method, type, char_reg, estimate = post_mean, t_cut, se = post_sd)
  # Per-factor alpha estimates with method-specific confidence bands
  mt_plot <- mt_sub %>%
    mutate(characteristic = char_reg %>% str_remove("__.+")) %>%
    left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>%
    left_join(t_cutoff, by = c("region", "method")) %>%
    mutate(
      # significant = if_else(p < 0.05 & estimate > 0, "Significant", "Insignificant"),
      type = case_when(
        p <= 0.05 & estimate > 0 & orig_sig == 1 ~ "Replicated",
        (p > 0.05 | estimate <= 0) & orig_sig == 1 ~ "Not Replicated",
        orig_sig == 0 ~ "Never Significant"
      )
    ) %>%
    bind_rows(eb_overview) %>%
    filter(region %in% se_regions & method %in% se_methods) %>%
    mutate(
      # NOTE(review): "[?=__]" is a character class, not a lookahead — the
      # intent was likely "(?=__)". Verify downstream before changing.
      characteristic = char_reg %>% str_extract(".+[?=__]") %>% str_remove("__"),
      type = type %>% factor(levels = c("Replicated", "Not Replicated", "Never Significant")),
      method_pretty = case_when(
        method == "BY" ~ "Multiple Testing - Benjamini-Yekutieli",
        method == "EB - Region" ~ "Empirical Bayes - US",
        method == "EB - All" ~ "Empirical Bayes - Global",
        method == "OLS" ~ "OLS"
      ),
      method_pretty = method_pretty %>% factor(levels = c("OLS", "Multiple Testing - Benjamini-Yekutieli", "Empirical Bayes - US", "Empirical Bayes - Global"))
    ) %>%
    group_by(characteristic) %>%
    # mutate(sort_var = statistic[method == "OLS" & region == "us"]) %>%
    mutate(sort_var = estimate[method == "OLS" & region == "us"]) %>%
    group_by(region, method) %>%
    mutate(
      ols_rank = frank(sort_var),
      repl_rate = sum(type == "Replicated") / sum(type %in% c("Replicated", "Not Replicated"))
    ) %>%
    ggplot(aes(reorder(ols_rank, sort_var), estimate, colour = type, linetype = type)) +
    geom_point() +
    geom_text(aes(x = 35, y = 1.45, label = str_c("Replication Rate: ", formatC(round(repl_rate*100, 1), digits = 1, format = "f"), "%")), colour = "black", size = 3, check_overlap = T) +
    geom_errorbar(aes(ymin = estimate - t_cut * se, ymax = estimate + t_cut * se)) +
    facet_wrap(~method_pretty, ncol = length(se_methods) / 2) +
    coord_cartesian(ylim = c(-1, 1.5)) +
    geom_hline(yintercept = 0, linetype = "dashed") +
    guides(colour = guide_legend(override.aes = list(shape = NA))) +
    labs(y = "Monthly Alpha (%)") +
    theme(
      axis.title.x = element_blank(),
      axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
      # text = element_text(size = 13),
      legend.title = element_blank(),
      legend.position = "top"
    )
  list("mt" = mt_plot, "repl" = repl_plot)
}

# plot_fdr: FDR and FWR of simulated data against the t critical value, with
# reference lines at rate 0.05 and t = 1.96.
plot_fdr <- function(simulated_fdr) {
  simulated_fdr %>%
    gather(fdr, fwr, key = "type", value = "rate") %>%
    mutate(type = type %>% str_to_upper()) %>%
    ggplot(aes(t_cutoff, rate, colour = type)) +
    geom_point() +
    geom_hline(yintercept = 0.05, linetype = "dashed") +
    geom_vline(xintercept = 1.96, linetype = "dotted") +
    scale_y_continuous(breaks = c(0, 0.05, 0.25, 0.5, 0.75, 1)) +
    scale_x_continuous(breaks = c(0, 1.96, 2.5, 5.0, 7.5, 10)) +
    labs(x = "Critical Value (t)", y = "Rate", colour = "Type:") +
    theme(
      legend.position = "top"
    )
}

# plot_factor_post: posterior mean alphas with 95% bands per factor, grouped
# and coloured by cluster. If `orig_sig`, keep only factors originally found
# significant. The tiny post_mean/1e6 term breaks ties in the cluster-median
# sort so within-cluster ordering is stable.
plot_factor_post <- function(eb, orig_sig, cluster_order) {
  if (orig_sig) {
    orig_sig_values <- T
  } else {
    orig_sig_values <- c(T, F)
  }
  eb$factors %>%
    left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>%
    filter(orig_sig %in% orig_sig_values) %>%
    group_by(hcl_label) %>%
    mutate(
      sort_var = median(post_mean) + post_mean / 1000000
    ) %>%
    mutate(hcl_label = hcl_label %>% factor(levels =
cluster_order)) %>%
    ggplot(aes(reorder(characteristic, sort_var), post_mean, colour = hcl_label, shape = hcl_label)) +
    geom_point() +
    scale_shape_manual(values=1:13) +
    geom_errorbar(aes(ymin = post_mean - 1.96 * post_sd, ymax = post_mean + 1.96 * post_sd)) +
    geom_hline(yintercept = 0, linetype = "dashed") +
    labs(y = "Monthly Alpha with 95% Confidence Interval (%)", colour = "Cluster", shape = "Cluster") +
    theme(
      axis.title.x = element_blank(),
      axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)
    )
}

# plot_repl_region: replication rate by cluster and region (bar chart,
# faceted by region). A factor counts as replicated when the lower 95%
# posterior bound is above zero. The n/1e6 term is a stable tie-breaker.
plot_repl_region <- function(eb_all, cluster_order) {
  eb_all$factors %>%
    left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>%
    filter(orig_sig == 1) %>%
    mutate(hcl_label = hcl_label %>% factor(levels = cluster_order)) %>%
    group_by(region, hcl_label) %>%
    summarise(n = n(), repl_rate = mean(post_mean - 1.96 * post_sd > 0)) %>%
    group_by(hcl_label) %>%
    mutate(
      sort_var = repl_rate[region == "us"] + n[region == "us"] / 1e6,
      region_pretty = case_when(
        region == "us" ~ "USA",
        region == "developed" ~ "Developed Ex. USA",
        region == "emerging" ~ "Emerging"
      ),
      region_pretty = region_pretty %>% factor(levels = c("USA", "Developed Ex. USA", "Emerging"))
    ) %>%
    ggplot(aes(reorder(hcl_label, sort_var), repl_rate*100, fill = hcl_label)) +
    geom_col() +
    labs(y = "Replication Rate (%)") +
    facet_wrap(~region_pretty, ncol = 1) +
    theme(
      legend.position = "none",
      axis.title.x = element_blank(),
      axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)
    )
}

# Figure 1 - Waterfall Graph
# plot_lit_comp: waterfall of replication rates across implementations, from
# the Hou-Xue-Zhang (2020) baseline (hard-coded 0.35) through raw returns,
# CAPM alphas, multiple-testing adjustment, and the two EB estimations.
plot_lit_comp <- function(eb_us, mt_res, eb_world, excl_insig=T) {
  if (excl_insig) {
    sig_group <- T
  } else {
    sig_group <- c(T, F)
  }
  # Raw-return regressions: intercept-only lm per characteristic
  raw_reg <- eb_us$input$long %>%
    group_by(characteristic) %>%
    nest() %>%
    mutate(
      raw_reg = data %>% map(~ lm(ret ~ 1, data = .x)),
      tidied = raw_reg %>% map(tidy)
    ) %>%
    unnest(tidied) %>%
    ungroup()
  raw_overall <- raw_reg %>% summarise(repl_rate = mean(p.value < 0.05 & estimate > 0)) %>% pull(repl_rate)
  raw_sig <- raw_reg %>%
    left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>%
    filter(orig_sig == T) %>%
    summarise(repl_rate = mean(p.value < 0.05 & estimate > 0)) %>%
    pull(repl_rate)
  # CAPM-alpha replication rates, with (BY) and without (OLS) MT adjustment
  capm <- mt_res %>%
    filter(region == "us" & method %in% c("BY", "OLS")) %>%
    mutate(characteristic = char_reg %>% str_remove("__.+")) %>%
    left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>%
    filter(orig_sig %in% sig_group) %>%
    group_by(method) %>%
    summarise(repl_rate = mean(p < 0.05 & estimate > 0))
  eb_us_repl <- eb_us$factors %>%
    left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>%
    filter(orig_sig %in% sig_group) %>%
    summarise(repl_rate = mean(p025 > 0)) %>%
    pull(repl_rate)
  eb_global_repl <- eb_world$factors %>%
    ungroup() %>%
    left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>%
    filter(orig_sig %in% sig_group) %>%
    summarise(repl_rate = mean(post_mean - 1.96 * post_sd > 0)) %>%
    pull(repl_rate)
  # Waterfall Graph
  litterature_comp <- tribble(
    ~ type, ~repl_rate,
    "hxz", 0.35,
    "raw", raw_overall,
    "raw_sig", raw_sig,
    "alpha", capm %>% filter(method == "OLS") %>% pull(repl_rate),
    "mt", capm %>% filter(method == "BY") %>% pull(repl_rate),
    "eb_us", eb_us_repl,
    "eb_global", eb_global_repl
  )
  if (excl_insig == F) {
    litterature_comp <- litterature_comp %>% filter(type != "raw_sig")
  }
  litterature_comp <- litterature_comp %>%
    mutate(
      repl_rate = repl_rate * 100,
      type = type %>% factor(levels = c("hxz", "raw", "raw_sig", "alpha", "mt", "eb_us", "eb_global")),
      prev_repl_rate = dplyr::lag(repl_rate, default = 0),
      impact = if_else(repl_rate > prev_repl_rate, "Increase", "Decrease"),
      impact = impact %>% factor(levels = c("Increase", "Decrease"))
    ) %>%
    setDT()
  w <- 0.3 #use to set width of bars
  l1 <- -3
  inc <- -3
  col_top <- "black" # colours_theme[2]
  col_bot <- "black" # colours_theme[1]
  type <- litterature_comp$type
  plot <- litterature_comp %>%
    ggplot(aes(xmin = as.integer(type) - w, xmax = as.integer(type) + w, ymin = prev_repl_rate, ymax = repl_rate, fill = impact)) +
    geom_rect() +
    geom_segment(data = litterature_comp[1:(.N - 1)], aes(x = as.integer(type) + w, xend = as.integer(type) + w + 1, y = repl_rate, yend = repl_rate)) +
    scale_x_discrete(limits = type) +
    geom_text(aes(x = as.integer(type), y = repl_rate + if_else(sign(repl_rate - prev_repl_rate)!=0, sign(repl_rate - prev_repl_rate), 1) * 2, label = str_c(formatC(round(repl_rate, 1), digits = 1, format = "f"), "%"))) +
    scale_fill_manual(values = (c("Increase" = colours_theme[1], "Decrease" = colours_theme[2]))) +
    labs(x = "Implementation", y = "Replication Rate (%)") +
    coord_cartesian(ylim = c(0, 90), expand = FALSE, clip = "off") +
    # HXZ
    annotate(geom = "text", x = "hxz", y = l1, label = "Hou, Xue, and", colour = col_top, fontface = 2) +
    annotate(geom = "text", x = "hxz", y = l1 + inc*1, label = "Zhang (2020)", colour = col_top, fontface = 2) +
    annotate(geom = "text", x = "hxz", y = l1 + inc*2, label = "Raw returns", colour = col_bot, fontface = 1) +
    # Our Raw
    annotate(geom = "text", x = "raw", y = l1, label = "Our sample", colour = col_top, fontface = 2) +
    annotate(geom = "text", x = "raw", y = l1 + inc*1, label = "Raw returns,", colour = col_bot) +
    annotate(geom = "text", x = "raw", y = l1 + inc*2, label = "our methodology", colour = col_bot) +
    # Our Alpha
    annotate(geom = "text", x = "alpha", y = l1, label = "Our sample", colour = col_top, fontface = 2) +
    annotate(geom = "text", x = "alpha", y = l1 + inc*1, label = "CAPM alphas", colour = col_bot) +
    # Our MT
    annotate(geom = "text", x = "mt", y = l1, label = "Harvey, Liu, and", colour = col_top, fontface = 2) +
    annotate(geom = "text", x = "mt", y = l1 + inc*1, label = "Zhu (2016)", colour = col_bot, fontface = 2) +
    annotate(geom = "text", x = "mt", y = l1 + inc*2, label = "Multiple testing", colour = col_bot) +
    annotate(geom = "text", x = "mt", y = l1 + inc*3, label = "adjustment", colour = col_bot) +
    # Our EB-US
    annotate(geom = "text", x = "eb_us", y = l1, label = "Our Bayesian", colour = col_top, fontface = 2) +
    annotate(geom = "text", x = "eb_us", y = l1 + inc*1, label = "estimation", colour = col_top, fontface = 2) +
    annotate(geom = "text", x = "eb_us", y = l1 + inc*2, label = "US data", colour = col_bot) +
    # Our EB-Global
    annotate(geom = "text", x = "eb_global", y = l1, label = "Our Bayesian", colour = col_top, fontface = 2) +
    annotate(geom = "text", x = "eb_global", y = l1 + inc*1, label = "estimation", colour = col_top, fontface = 2) +
    annotate(geom = "text", x = "eb_global", y = l1 + inc*2, label = "Global data", colour = col_bot) +
    theme(
      legend.title = element_blank(),
      plot.margin = unit(c(1, 1, 4, 1), "lines"),
      axis.title.x = element_blank(),
      axis.text.x = element_blank(),
      panel.grid.major.x = element_blank(),
      panel.grid.minor.x = element_blank())
  if (excl_insig) {
    # Our Raw Significant only
    plot <- plot +
      annotate(geom = "text", x = "raw_sig", y = l1, label = "Our sample", colour = col_top, fontface = 2) +
      annotate(geom = "text", x = "raw_sig", y = l1 + inc*1, label = "Excl. factors", colour = col_bot) +
      annotate(geom = "text", x = "raw_sig", y = l1 + inc*2, label = "never significant", colour = col_bot)
  }
  return(plot)
}

# plot_many_factors: illustrates how 95% confidence bands widen as the number
# of tested factors grows under MT adjustments, vs. the EB posterior band.
# Reads the global `eb_est`; no arguments.
plot_many_factors <- function() {
  # The Power of Many Factors
  many_factors_se <- eb_est$us$input$long %>%
    select("char_reg" = name_wide, ret_neu_scaled, mkt_vw_exc) %>%
    mutate(
      region = char_reg %>% str_extract(pattern = "(?<=_{2}).+"),
      region = case_when(
        region == "us" ~ "USA",
        region == "developed" ~ "Developed Ex. USA",
        region == "emerging" ~ "Emerging"
      ),
      region = region %>% factor(levels = c("USA", "Developed Ex. USA", "Emerging"))
    ) %>%
    filter(!is.na(ret_neu_scaled)) %>%
    group_by(char_reg, region) %>%
    nest() %>%
    mutate(
      fit = data %>% map(~lm(ret_neu_scaled ~ mkt_vw_exc, data = .x)), # They are market neutral by construction but this takes care of the degress of freedom adjustment
      nw = fit %>% map(~ lmtest::coeftest(.x, vcov = sandwich::NeweyWest(.x)) %>% broom::tidy()),
      df = fit %>% map_dbl(~ .x$df.residual)
    ) %>%
    unnest(nw) %>%
    filter(term == "(Intercept)") %>%
    rename("p_ols" = p.value) %>%
    # group_by(region) %>%
    ungroup() %>%
    mutate(
      n = n(),
      p_bonf = p_ols %>% p.adjust(method = "bonferroni"),
      p_holm = p_ols %>% p.adjust(method = "holm"),
      p_bh = p_ols %>% p.adjust(method = "BH"),
      p_by = p_ols %>% p.adjust(method = "BY")
    ) %>%
    select(n, region, char_reg, estimate, statistic, df, "se" = std.error, starts_with("p_")) %>%
    gather(starts_with("p_"), key = "method", value = "p") %>%
    mutate(
      method = method %>% str_remove("^p_"),
      mt_adj = case_when(
        method == "ols" ~ "None",
        method == "bh" ~ "FDR",
        method == "by" ~ "FDR",
        method == "bonf" ~ "FWR",
        method == "holm" ~ "FWR"
      ),
      method = case_when(
        method == "ols" ~ "OLS",
        method == "bh" ~ "BH",
        method == "by" ~ "Benjamini-Yekutieli",
        method == "bonf" ~ "Bonferroni",
        method == "holm" ~ "Holm",
        TRUE ~ method
      ),
      method = method %>% factor(levels = c("OLS", "Bonferroni", "Holm", "BH", "Benjamini-Yekutieli", "EB - Region", "EB - Full"))
    )
  # Implied t-cutoff per method when all 140 factors are tested jointly
  (mf_t <- many_factors_se %>%
      group_by(method) %>%
      summarise(
        t_140 = (min(abs(statistic)[p < 0.05]) + max(abs(statistic)[p > 0.05])) / 2
      ))
  avg_se <- mean(many_factors_se$se)
  # Confidence bands at n = 1 (t = 1.96) and n = 140 (method-implied t)
  ols_ci <- tibble(
    method = c("OLS", "Bonferroni", "Benjamini-Yekutieli", "Empirical Bayes"),
    t_1 = rep(1.96, 4)
  ) %>%
    left_join(mf_t, by = "method") %>%
    gather(t_1, t_140, key = "n_factors", value = "t") %>%
    mutate(
      n_factors = n_factors %>% str_remove("t_") %>% as.integer(),
      p025 = -t * avg_se,
      p975 = t * avg_se
    ) %>%
    filter(!(method == "Empirical Bayes" & n_factors == 140))
  eb_ci <- eb_est$us$factors %>%
    summarise(
      eb_se = mean(post_sd)
    ) %>%
    transmute(
      method = "Empirical Bayes",
      n_factors = 140,
      p025 = -1.96 * eb_se,
      p975 = 1.96 * eb_se
    )
  comb_data <- bind_rows(ols_ci, eb_ci)
  comb_data %>%
    ggplot(aes(n_factors, colour = method, linetype = method)) +
    geom_line(aes(y = p025)) +
    geom_line(aes(y = p975)) +
    geom_ribbon(data=comb_data %>% filter(method == "Empirical Bayes"), aes(x = n_factors, ymin=p025,ymax=p975), fill=colours_theme[3], alpha=0.2, inherit.aes = F) +
    labs(x = "Number of Factors", y = "Centered 95% Confidence Interval", colour = "Method", linetype = "Method") +
    scale_y_continuous(breaks=c(avg_se * 1.96, 0, -avg_se * 1.96), labels=c(expression(hat(alpha) + sigma[hat(alpha)] %*% t), expression(hat(alpha)), expression(hat(alpha) - sigma[hat(alpha)] %*% t))) +
    scale_x_continuous(breaks = c(1, 140), expand=expansion(mult = c(0, 0.3), add = c(5, 0))) +
    geom_dl(aes(label = method, y = p975), method = list(dl.trans(x = x + 0.2), "last.points", cex = 1)) +
    theme(
      text = element_text(size = 12),
      axis.title.y = element_blank(),
      legend.position = "none"
    )
}

# Plot In-Sample vs.
# Out of Sample
# plot_is_oos: scatter of in-sample vs. post-sample alphas per region.
# `cluster_plot` (cluster-level averages) is built but not returned; the
# factor-level scatter is the function's value.
plot_is_oos <- function(ub_us, ub_dev, ub_emer) {
  is_oos_split <- ub_us$ols_regs %>%
    mutate(region = "us") %>%
    bind_rows(
      ub_dev$ols_regs %>% mutate(region = "dev"),
      ub_emer$ols_regs %>% mutate(region = "emer")
    ) %>%
    mutate(
      region = case_when(
        region == "us" ~ "USA",
        region == "dev" ~ "Developed",
        region == "emer" ~ "Emerging"
      ),
      region = region %>% factor(levels = c("USA", "Developed", "Emerging"))
    ) %>%
    rename("characteristic" = c)
  # Cluster-level averages (unused return value — kept for reference)
  cluster_plot <- is_oos_split %>%
    left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>%
    filter(orig_sig == 1) %>%
    left_join(cluster_labels, by = "characteristic") %>%
    gather(is, post_sample, key = "period", value = "estimate") %>%
    group_by(region, hcl_label, period) %>%
    summarise(
      alpha_mean = mean(estimate)
    ) %>%
    select(region, hcl_label, period, alpha_mean) %>%
    spread(key = period, value = alpha_mean) %>%
    mutate(nudge_y = -0.015) %>%
    ggplot(aes(is, post_sample)) +
    geom_point() +
    ggrepel::geom_text_repel(aes(label = hcl_label), nudge_y = -0.015) +
    geom_abline(intercept = 0, slope = 1, linetype = "dotted") +
    # geom_smooth(method = "lm", se = F) +
    facet_wrap(~region) +
    labs(x = "Monthly Alpha (%): In-Sample", y = "Monthly Alpha (%): Post Sample")
  is_oos_split %>%
    left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>%
    filter(orig_sig == 1) %>%
    ggplot(aes(is, post_sample)) +
    geom_point() +
    geom_abline(intercept = 0, slope = 1, linetype = "dotted") +
    geom_smooth(method = "lm", se = F, formula = "y ~ x") +
    facet_wrap(~region, ncol = 1) +
    labs(x = "Monthly Alpha (%): In-Sample", y = "Monthly Alpha (%): Post Sample")
}

# plot_is_oos_factors: in-sample vs. out-of-sample factor alphas by region,
# with an OLS fit line. The nested lm summary is computed (auto-printed in
# interactive use) before the plot is returned.
plot_is_oos_factors <- function(is_oos_regions) {
  is_oos_data <- is_oos_regions %>%
    left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>%
    filter(orig_sig == 1) %>%
    mutate(
      region = case_when(
        region == "us" ~ "USA",
        region == "developed" ~ "Developed Ex. USA",
        region == "emerging" ~ "Emerging"
      ),
      region = region %>% factor(levels = c("USA", "Developed Ex. USA", "Emerging"))
    ) %>%
    select(region, characteristic, period, estimate) %>%
    spread(key = period, value = estimate)
  is_oos_data %>%
    group_by(region) %>%
    nest() %>%
    mutate(
      fit = data %>% map(~lm(oos ~ is, data = .x)),
      n = fit %>% map_dbl(~length(.x$residuals)),
      tidied = fit %>% map(tidy)
    ) %>%
    unnest(tidied)
  is_oos_data %>%
    ggplot(aes(is, oos)) +
    geom_point() +
    geom_smooth(method = "lm", se = F, formula = "y ~ x") +
    facet_wrap(~region) +
    geom_hline(yintercept = 0) +
    geom_vline(xintercept = 0) +
    coord_fixed() +
    labs(x = "Monthly Alpha (%): In-Sample", y = "Monthly Alpha (%): Out-of-Sample")
}

# plot_tpf: tangency-portfolio weights per cluster with bootstrap error bars.
# "Apparent" rows hold the point estimates; remaining rows are bootstrap
# draws summarised into quantile bands.
plot_tpf <- function(tpf, cluster_order, ci_low = 0.05, ci_high = 0.95) {
  orig <- tpf %>% filter(id == "Apparent") %>% select(coef) %>% unnest(coef) %>% rename("tpf_weight" = weight)
  bs <- tpf %>%
    filter(id != "Apparent") %>%
    unnest(coef) %>%
    group_by(term) %>%
    summarise(
      bs_mean = mean(weight),
      bs_sd = sd(weight),
      bs_se = bs_sd / sqrt(n()),
      bs_low = weight %>% quantile(ci_low),
      bs_high = weight %>% quantile(ci_high),
      bs_prob_zero = mean(weight == 0)
    ) %>%
    left_join(orig, by = "term") %>%
    mutate(bs_bias = bs_mean - tpf_weight)
  print(paste0("Clusters with significantly positive TPF weight: ", sum(filter(bs, term != "Market")$bs_low>0)))
  bs %>%
    mutate(
      term = term %>% factor(levels = c(cluster_order, "Market"))
    ) %>%
    ggplot(aes(reorder(term, tpf_weight), tpf_weight*100, fill = term)) +
    geom_col() +
    coord_flip() +
    geom_errorbar(mapping = aes(ymin = bs_low*100, ymax = bs_high*100), width = 0.2, size = 0.2) +
    labs(y = "Weight in Tangency PF (%)") +
    theme(
      axis.title.y = element_blank(),
      legend.position = "none"
    )
}

# plot_tpf_region: as plot_tpf, but faceted by market region, clusters sorted
# by their US weight.
plot_tpf_region <- function(tpf_us, tpf_dev, tpf_emer, cluster_order, ci_low = 0.05, ci_high = 0.95) {
  all <- bind_rows(tpf_us, tpf_dev, tpf_emer)
  orig <- all %>% filter(id == "Apparent") %>% select(market_region, coef) %>% unnest(coef) %>% rename("tpf_weight" = weight)
  bs <- all %>%
    filter(id != "Apparent") %>%
    unnest(coef) %>%
    group_by(market_region, term) %>%
    summarise(
      bs_mean = mean(weight),
      bs_sd = sd(weight),
      bs_se = bs_sd / sqrt(n()),
      bs_low = weight %>% quantile(ci_low),
      bs_high = weight %>% quantile(ci_high),
      bs_prob_zero = mean(weight == 0)
    ) %>%
    left_join(orig, by = c("market_region", "term")) %>%
    mutate(bs_bias = bs_mean - tpf_weight)
  bs %>%
    group_by(term) %>%
    mutate(
      sort_var = tpf_weight[market_region == "us"],
      region_pretty = case_when(
        market_region == "us" ~ "USA",
        market_region == "developed" ~ "Developed Ex. USA",
        market_region == "emerging" ~ "Emerging"
      ),
      region_pretty = region_pretty %>% factor(levels = c("USA", "Developed Ex. USA", "Emerging"))
    ) %>%
    mutate(term = term %>% factor(levels = c(cluster_order, "Market"))) %>%
    ggplot(aes(reorder(term, sort_var), tpf_weight*100, fill = term)) +
    geom_col() +
    geom_errorbar(mapping = aes(ymin = bs_low*100, ymax = bs_high*100), width = 0.2, size = 0.2) +
    labs(y = "Weight in Tangency PF (%)") +
    theme(
      axis.title.x = element_blank(),
      axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
      legend.position = "none"
    ) +
    facet_wrap(~region_pretty, ncol = 1)
}

# plot_tpf_size: as plot_tpf, but faceted by size group (Mega..Nano),
# clusters sorted by their weight in the mega-cap sample.
plot_tpf_size <- function(tpf_size_samples, cluster_order, ci_low = 0.05, ci_high = 0.95) {
  orig <- tpf_size_samples %>% filter(id == "Apparent") %>% select(size_grp, coef) %>% unnest(coef) %>% rename("tpf_weight" = weight)
  bs <- tpf_size_samples %>%
    filter(id != "Apparent") %>%
    unnest(coef) %>%
    group_by(size_grp, term) %>%
    summarise(
      bs_mean = mean(weight),
      bs_sd = sd(weight),
      bs_se = bs_sd / sqrt(n()),
      bs_low = weight %>% quantile(ci_low),
      bs_high = weight %>% quantile(ci_high),
      bs_prob_zero = mean(weight == 0)
    ) %>%
    left_join(orig, by = c("size_grp", "term")) %>%
    mutate(bs_bias = bs_mean - tpf_weight)
  bs %>%
    group_by(term) %>%
    mutate(
      sort_var = tpf_weight[size_grp == "mega"],
      size_grp_pretty = size_grp %>% str_to_title(),
      size_grp_pretty = size_grp_pretty %>% factor(levels = c("Mega", "Large", "Small", "Micro", "Nano"))
    ) %>%
    mutate(term = term %>% factor(levels = c(cluster_order, "Market"))) %>%
    ggplot(aes(reorder(term, sort_var), tpf_weight*100, fill = term)) +
    geom_col() +
    geom_errorbar(mapping = aes(ymin = bs_low*100, ymax = bs_high*100), width = 0.2, size = 0.2) +
    labs(y = "Weight in Tangency PF (%)") +
    theme(
      axis.title.x = element_blank(),
      axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
      legend.position = "none"
    ) +
    facet_wrap(~size_grp_pretty, ncol = 1)
}

# plot_over_time: posterior of the equal-weighted average factor alpha as the
# sample end date extends. Relies on globals `ot_region`, `char_info`,
# `settings`, `regional_pfs`, `colours_theme` — defined elsewhere.
# (Definition continues past this chunk.)
plot_over_time <- function(posterior_over_time, orig_sig, ols_incl, lb, bw) {
  if (orig_sig) {
    orig_sig_values <- T
  } else {
    orig_sig_values <- c(T, F)
  }
  all_factors <- tibble("char_reg" = rownames(posterior_over_time[[1]]$factor_mean)) %>%
    mutate(characteristic = char_reg %>% str_remove(paste0("__", ot_region))) %>%
    left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>%
    mutate(
      selected_factors = (orig_sig %in% orig_sig_values)
    )
  i <- all_factors$selected_factors
  # Posterior mean/sd of the equal-weighted average alpha at each end date
  full_posterior <- posterior_over_time %>% lapply(function(eb_act) {
    a <- eb_act$factor_mean[i]
    a_cov <- eb_act$factor_cov[i, i]
    n <- length(a)
    w <- rep(1/n, n)
    post_mean <- drop(t(a) %*% w)
    post_sd <- drop(sqrt(t(w) %*% a_cov %*% w))
    avg_ols <- mean(eb_act$factors$ols_est[i])
    tibble("end_date"= eb_act$end_date, n=n, post_mean, post_sd, avg_ols)
  }) %>% bind_rows()
  # Black and white coloring
  if (bw) {
    col1 <- "black"
    col2 <- "grey35"
  } else {
    col1 <- colours_theme[1]
    col2 <- colours_theme[2]
  }
  if (ols_incl) {
    # Create OLS benchmarks
    # NOTE(review): `scale_alpha` relies on R's partial matching against
    # eb_prepare's `scale_alphas` parameter — verify intended.
    ols_bm <- seq.Date(from = as.Date("1959-12-31"), to = settings$end_date, by = "1 year") %>% lapply(function(end_date) {
      data <- regional_pfs[region == ot_region] %>%
        filter(characteristic %in% all_factors$characteristic[i]) %>%
        filter(eom >= settings$start_date & eom <= end_date) %>%
        eb_prepare(
          scale_alpha = settings$eb$scale_alpha,
          overlapping = settings$eb$overlapping
        )
      avg_alpha_full <- data$long %>%
        group_by(characteristic) %>%
        summarise(
          n = n(),
          alpha =
mean(ret_neu_scaled) ) %>% ungroup() %>% summarise( end_date = end_date, type = "avg_alpha_full", alpha = mean(alpha) ) avg_alpha_st <- data$long %>% filter(year(eom) > (year(end_date)-lb)) %>% group_by(characteristic) %>% mutate( ret_neu_st = (ret - cov(ret, mkt_vw_exc)/var(mkt_vw_exc) * mkt_vw_exc)*100, ret_neu_st = ret_neu_st / sd(ret_neu_st) * (10 / sqrt(12)) ) %>% summarise(alpha = mean(ret_neu_st)) %>% ungroup() %>% summarise( end_date = end_date, type = "avg_alpha_st", alpha = mean(alpha) ) alpha_avg <- data$long %>% group_by(eom) %>% summarise( n = n(), ret = mean(ret_neu_scaled), mkt_vw_exc = unique(mkt_vw_exc) ) %>% ungroup() %>% mutate( ret_neu = ret - cov(ret, mkt_vw_exc) / var(mkt_vw_exc) * mkt_vw_exc ) %>% summarise( end_date = end_date, type = "alpha_avg_full", alpha = mean(ret_neu) ) bind_rows(avg_alpha_full, avg_alpha_st, alpha_avg) }) %>% bind_rows() ols_bm_wide <- ols_bm %>% spread(key = type, value = alpha) (plot_1 <- full_posterior %>% left_join(ols_bm_wide, by = "end_date") %>% ggplot(aes(end_date)) + geom_point(aes(y = post_mean, colour="Average Posterior Alpha", shape = "Average Posterior Alpha")) + geom_point(aes(y = avg_alpha_full, colour="Average OLS Alpha", shape = "Average OLS Alpha")) + geom_errorbar(aes(ymin = post_mean + 1.96 * post_sd, ymax = post_mean - 1.96 * post_sd)) + scale_colour_manual(name = "Test", values = c("Average Posterior Alpha"=col1, "Average OLS Alpha"=col2)) + scale_shape_manual(name = "Test", values = c("Average Posterior Alpha" = 16, "Average OLS Alpha" = 17)) + labs(y = "Posterior Alpha with 95% CI (%)") + ylim(c(0, NA)) + scale_x_date(breaks = seq.Date(as.Date("1960-12-31"), as.Date("2020-12-31"), by = "10 years"), date_labels = "%Y-%m") + theme( legend.title = element_blank(), legend.position = "top", axis.text.x = element_blank(), axis.title.x = element_blank() )) plot_2 <- full_posterior %>% left_join(ols_bm_wide, by = "end_date") %>% ggplot(aes(end_date, avg_alpha_st)) + geom_col() + labs(y = "5-year 
Rolling Alpha (%)") + scale_x_date(breaks = seq.Date(as.Date("1960-12-31"), as.Date("2020-12-31"), by = "10 years"), date_labels = "%Y-%m") + theme( axis.title.x = element_blank() ) # print(full_posterior %>% left_join(ols_bm_wide, by = "end_date") %>% mutate(diff_pm = post_mean - lag(post_mean), diff_aaf = avg_alpha_full - lag(avg_alpha_full)) %>% filter(end_date != as.Date("1960-12-31")) %>% summarise(sd_pm = sd(diff_pm), sd_aaf = sd(diff_aaf))) plot <- cowplot::plot_grid(plot_1, plot_2, ncol = 1, rel_heights = c(2, 1)) } else { plot <- full_posterior %>% ggplot(aes(end_date)) + geom_point(aes(y = post_mean), colour=col1, shape = 16) + geom_errorbar(aes(ymin = post_mean + 1.96 * post_sd, ymax = post_mean - 1.96 * post_sd)) + labs(y = "Posterior Alpha with 95% CI (%)") + ylim(c(0, NA)) + scale_x_date(breaks = seq.Date(as.Date("1960-12-31"), as.Date("2020-12-31"), by = "10 years"), date_labels = "%Y-%m") + theme( legend.title = element_blank(), legend.position = "top", axis.title.x = element_blank() ) } print(full_posterior %>% mutate(ci_width = post_sd*1.96*2) %>% filter(end_date %in% c(as.Date("1960-12-31"), settings$end_date))) plot } plot_taus_over_time <- function(posterior_over_time_flex) { data <- posterior_over_time_flex %>% lapply(function(x) { x$mle %>% mutate(end_date = x$end_date) }) %>% bind_rows() %>% filter(estimate != "alpha") %>% mutate(estimate_pretty = if_else(estimate == "tau_s", "tau_w", estimate)) ymax <- max(data$ml_est) data %>% ggplot(aes(end_date, ml_est, colour = estimate_pretty, linetype = estimate_pretty)) + geom_line() + scale_linetype_manual(values = c('tau_c' = "solid", 'tau_w' = "longdash"), name = '', labels = c(expression(tau[c]), expression(tau[w]))) + scale_colour_manual(values = c('tau_c' = colours_theme[1], 'tau_w' = colours_theme[2]), name = '', labels = c(expression(tau[c]), expression(tau[w]))) + scale_x_date(breaks = seq.Date(as.Date("1960-12-31"), as.Date("2020-12-31"), by = "10 years"), date_labels = "%Y-%m") + labs(y = 
"Maximum Likelihood Estimate (%)") + ylim(c(0, ymax)) + theme( legend.title = element_blank(), legend.position = "top", axis.title.x = element_blank() ) } plot_sim_fdr <- function(simulation) { tau_w_names <- c( `0.01` = expression(tau[w] ~ "= 0.01%"), `0.2` = expression(tau[w] ~ "= 0.20%") ) stat_labels <- c( `False Discovery Rate` = expression(~"False Discovery Rate"), `True Discovery Rate` = expression(~"True Discovery Rate"), `True Discoveries` = expression(~"True Discoveries"), `False Discoveries` = expression(~"False Discoveries") ) plot_data <- simulation %>% gather(n_disc, fdr, true_disc, false_disc, true_disc_rate, key = "stat", value = "number") %>% filter(stat %in% c("fdr", "true_disc", "false_disc", "true_disc_rate")) %>% mutate( stat = case_when( stat == "fdr" ~ "False Discovery Rate", stat == "true_disc" ~ "True Discoveries", stat == "false_disc" ~ "False Discoveries", stat == "true_disc_rate" ~ "True Discovery Rate" ), stat = stat %>% factor(levels = c("False Discovery Rate", "True Discovery Rate", "True Discoveries", "False Discoveries")), type = case_when( type == "by" ~ "Benjamini and Yekutieli", type == "ols" ~ "OLS", type == "eb" ~ "Empirical Bayes" ), type = type %>% factor(levels = c("OLS", "Benjamini and Yekutieli", "Empirical Bayes")), # tau_w_title = formatC(tau_w, digits = 2, format = "f"), # tau_w_title = as.character(eval(bquote(tau[w] ~ "=" ~ .(tau_w_title)~ "%"))) tau_w_title = tau_w %>% factor(labels = tau_w_names), stat_title = stat %>% factor(label = stat_labels) ) fdr_plot <- plot_data %>% filter(stat == "False Discovery Rate") %>% ggplot(aes(tau_c, number, colour = type)) + geom_point() + geom_line() + labs(x = "tau_c (%)", y = "False Discovery Rate", colour = "Adjustment") + facet_wrap(stat~tau_w_title, labeller = label_bquote(tau[w] ~ "=" ~ .(tau_w_title)~ "%")) true_disc_rate <- plot_data %>% filter(stat == "True Discovery Rate") %>% ggplot(aes(tau_c, number, colour = type)) + geom_point() + geom_line() + labs(x = "tau_c (%)", 
y = "True Discovery Rate", colour = "Adjustment") + facet_wrap(stat~tau_w_title, ncol = 2) plot_data %>% filter(stat %in% c("False Discovery Rate", "True Discovery Rate")) %>% group_by(stat) %>% mutate(scale_max = max(number)) %>% mutate(scale_min = min(number)) %>% ggplot(aes(tau_c, number, colour = type, shape = type)) + geom_point() + geom_point(aes(y = scale_max), alpha = 0) + geom_point(aes(y = scale_min), alpha = 0) + geom_line() + labs(x = bquote(bold(tau[c])~"(%)"), colour = "Type:", linetype = "Type:", shape = "Type:") + facet_wrap(stat_title~tau_w_title, scales = "free_y", labeller = label_parsed) + # facet_wrap(stat_title~tau_w_title) + # facet_wrap(stat~tau_w_title, scales = "free_y", labeller = label_bquote(tau[w] ~ "=" ~ .(tau_w_title)~ "%")) + theme( axis.title.y = element_blank(), axis.title.x = element_text(size = 12), legend.position = "top" ) } plot_size_overall <- function(eb_size, flipped = F, text = F) { size_repl <- eb_size %>% left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>% filter(orig_sig == 1) %>% group_by(size_grp) %>% summarise( repl_rate = mean(p025>0) ) if (flipped) { size_plot <- size_repl %>% mutate(size_grp = size_grp %>% factor(levels = c("Nano", "Micro", "Small", "Large", "Mega"))) %>% ggplot(aes(x = size_grp, y = repl_rate*100)) + geom_col(fill = colours_theme[1]) + coord_flip() + labs(y = "Replication Rate (%)") + theme(axis.title.y = element_blank()) if (text) { size_plot <- size_plot + geom_text(aes(label = str_c(formatC(round(repl_rate * 100, 1), digits = 1, format = "f"), "%")), nudge_y = 7, size = 5.5) } } else { size_plot <- size_repl %>% ggplot(aes(x = size_grp, y = repl_rate*100)) + geom_col(fill = colours_theme[1]) + labs(y = "Replication Rate (%)") + theme(axis.title.x = element_blank()) if (text) { size_plot <- size_plot + geom_text(aes(label = str_c(formatC(round(repl_rate * 100, 1), digits = 1, format = "f"), "%")), nudge_y = 2.5, size = 4) } } size_plot } 
# Replication rate per cluster, faceted by size group (Mega..Nano); each
# facet title carries the overall replication rate for that size group.
# Uses global char_info for original-significance flags.
plot_size_clusters <- function(eb_size, cluster_order) {
  overall <- eb_size %>% left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>% filter(orig_sig == 1) %>% group_by(size_grp) %>% summarise( overall_rr = mean(p025>0) )
  cluster_rr <- eb_size %>% left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>% filter(orig_sig == 1) %>% mutate(hcl_label = hcl_label %>% factor(levels = cluster_order)) %>% group_by(size_grp, hcl_label) %>% summarise( repl_rate = mean(p025>0) ) %>% group_by(hcl_label) %>% mutate(sort_var = repl_rate[size_grp == "Mega"]) %>% left_join(overall, by = "size_grp") %>% mutate(size_title = str_c(size_grp, " - Replication Rate: ", formatC(round(overall_rr * 100, 1), digits = 1, format = "f"), "%"))
  # Order the facet titles Mega -> Nano by matching on the size-group name.
  titles <- cluster_rr$size_title %>% unique()
  title_order <- c(titles[str_detect(titles, "Mega")], titles[str_detect(titles, "Large")], titles[str_detect(titles, "Small")], titles[str_detect(titles, "Micro")], titles[str_detect(titles, "Nano")])
  cluster_rr %>% mutate(size_title = size_title %>% factor(levels = title_order)) %>% ggplot(aes(x = reorder(hcl_label, sort_var), y = repl_rate*100, fill = hcl_label)) + geom_col() + labs(y = "Replication Rate (%)") + facet_wrap(~size_title, ncol = 1) + theme( axis.title.x = element_blank(), legend.position = "none", axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1, size = 10) )
}

# Cross-country sign test plots. sign_test: per-characteristic tibble with
# bootstrap p-value `p` and `pos_act` (share of countries with a positive
# return). Returns a list: $factors (per-factor bar chart) and $clusters
# (cluster-level panels A/B). Uses globals char_info, cluster_labels,
# cluster_order.
plot_sign_test <- function(sign_test) {
  sig <- sign_test %>% left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>% mutate( type = case_when( p <= 0.05 & orig_sig == 1 ~ "Replicated", p > 0.05 & orig_sig == 1 ~ "Not Replicated", orig_sig == 0 ~ "Never Significant" ), type = type %>% factor(levels = c("Replicated", "Not Replicated", "Never Significant"))
    # p_value = if_else(p < 0.05, "Significant", "Insignificant"),
    # p_value = p_value %>% factor(levels = c("Significant", "Insignificant"))
  )
  sig_overall <- sig %>% summarise(repl_rate = sum(type == "Replicated") / sum(type %in% c("Replicated", "Not Replicated"))) %>% pull(repl_rate)
  plot_sign_factors <- sig %>% ggplot(aes(reorder(characteristic, pos_act), pos_act*100, fill = type)) + geom_col() + ylim(c(0, 100)) + labs(y = "Countries with Positive Return (%)", fill = "Bootstrapped p-Value:") + geom_text(aes(x = 18, y = 100, label = str_c("Replication Rate: ", round(sig_overall*100, 2), "%")), inherit.aes = F) +
    # geom_text(aes(label = round(p, 2)), nudge_y = 1, size = 1.5) +
    theme( axis.title.x = element_blank(),
      # legend.title = element_blank(),
      legend.position = "top", axis.text.x = element_text(size = 7, angle = 90, vjust = 0, hjust = 1), text = element_text(size = 10) )
  # Panel A: average share of countries with positive alpha, per cluster.
  plot_pos <- sig %>% left_join(cluster_labels, by = "characteristic") %>% mutate(hcl_label = hcl_label %>% factor(levels = cluster_order)) %>% group_by(hcl_label) %>% filter(type %in% c("Replicated", "Not Replicated")) %>% summarise( repl_rate = sum(type == "Replicated") / sum(type %in% c("Replicated", "Not Replicated")), pos = mean(pos_act) ) %>% ggplot(aes(reorder(hcl_label, pos), pos*100, fill = hcl_label)) + geom_col() + labs(y = "Countries with Positive Alpha (%)") + ylim((c(0, 100))) + theme( legend.position = "none", axis.title.y = element_text(size=8), axis.title.x = element_blank(), axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1) )
  # Panel B: sign-test replication rate per cluster.
  plot_sig <- sig %>% left_join(cluster_labels, by = "characteristic") %>% mutate(hcl_label = hcl_label %>% factor(levels = cluster_order)) %>% group_by(hcl_label) %>% summarise(repl_rate = sum(type == "Replicated") / sum(type %in% c("Replicated", "Not Replicated"))) %>% ggplot(aes(reorder(hcl_label, repl_rate), repl_rate*100, fill = hcl_label)) + geom_col() + labs(y = "Sign Test Replication Rate (%)") + theme( legend.position = "none", axis.title.y = element_text(size=8), axis.title.x = element_blank(), axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1) )
  plot_sign_clusters <- cowplot::plot_grid(plot_pos, plot_sig, ncol = 1, labels = c("A", "B"), label_y = 1, label_x = 0)
  list("factors" = plot_sign_factors, "clusters" = plot_sign_clusters)
}

# World ex us versus us
# Average correlation of matching US and World-ex-US factor returns within
# each cluster; also prints the correlation quantiles.
plot_int_cor <- function(eb_us, eb_world_ex_us) {
  # NOTE(review): "monhts" is a typo for "months"; the column is not used downstream here.
  cor_data <- eb_us$input$long %>% bind_rows(eb_world_ex_us$input$long) %>% select(characteristic, region, eom, ret) %>% spread(key = region, value = ret) %>% left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>% filter(!is.na(us) & !is.na(world_ex_us)) %>% group_by(characteristic) %>% summarise( monhts = n(), cor = cor(us, world_ex_us) )
  print(cor_data %>% pull(cor) %>% quantile())
  cor_data %>% left_join(cluster_labels, by = "characteristic") %>% group_by(hcl_label) %>% summarise( cor_avg = mean(cor) ) %>% ggplot(aes(reorder(hcl_label, cor_avg), cor_avg)) + geom_col(fill = colours_theme[1]) + labs(y = "Correlation of US and World ex. US factor (Avg. within Cluster)") + coord_flip() + theme( axis.title.y = element_blank() )
}

# Scatter of World-ex-US OLS alpha on US OLS alpha per characteristic, with
# the fitted regression equation and t-statistics annotated; also prints the
# US/World correlation quantiles.
plot_world_vs_us <- function(eb_us, eb_world_ex_us) {
  cor_data <- eb_us$input$long %>% bind_rows(eb_world_ex_us$input$long) %>% select(characteristic, region, eom, ret) %>% spread(key = region, value = ret) %>% left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>% filter(!is.na(us) & !is.na(world_ex_us)) %>% group_by(characteristic) %>% summarise( monhts = n(), cor = cor(us, world_ex_us) )
  print(cor_data %>% pull(cor) %>% quantile())
  region_data <- eb_us$factors %>% bind_rows(eb_world_ex_us$factors) %>% select(characteristic, region, ols_est) %>% spread(key = region, value = ols_est) %>% left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic")
  max_scale <- max(c(region_data$world_ex_us, region_data$us))
  min_scale <- min(c(region_data$world_ex_us, region_data$us))
  fit_all <- lm(world_ex_us ~ us, data = region_data)
  int <- fit_all$coefficients[1]
  slp <- fit_all$coefficients[2]
  r2 <- summary(fit_all)$r.squared
  # Build the "y = a + b*x, R^2 = ..." plotmath annotation.
  eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(R)^2~"="~r2, list(a = format(unname(int), digits = 2), b = format(unname(slp), digits = 2), r2 = format(r2, digits = 3)))
  lbl <- as.character(as.expression(eq))
  t_int <- formatC(round(summary(fit_all)$coefficients["(Intercept)", "t value"], 2), format='f', digits=2)
  t_us <- formatC(round(summary(fit_all)$coefficients["us", "t value"], 2), format='f', digits=2)
  region_data %>% mutate( orig_sig_pretty = if_else(orig_sig == 1, "Studied", "Not Studied"), orig_sig_pretty = orig_sig_pretty %>% factor(levels = c("Studied", "Not Studied")) ) %>% ggplot(aes(us, world_ex_us, colour = orig_sig_pretty, shape = orig_sig_pretty)) + geom_point() + geom_abline(intercept = 0, slope = 1, linetype = "dotted") +
    # geom_abline(intercept = int, slope = slp) +
    xlim(c(min_scale, max_scale)) + ylim(c(min_scale, max_scale)) + annotate("text", label=lbl, parse=TRUE, x=min_scale, y=max_scale, hjust = 0) + annotate("text", label=paste0(" (", t_int, ") (", t_us, ")"), parse=F, x=min_scale, y=max_scale-0.07, hjust = 0, size = 3.3) + labs(x = "US Alpha (%)", y = "World Ex. US Alpha (%)") + theme( legend.position = "none", legend.title = element_blank() )
}

# IS-vs-OOS posterior-alpha scatter with a linear fit, one plot per period
# ("pre", "post", "pre_post"). type: "OLS" (lm) or "GLS" (generalized least
# squares using the EB factor covariance from global eb_est$us). Returns the
# list of three ggplots.
plot_is_oos_post <- function(is_oos, type) {
  # type in c("GLS", "OLS")
  plot_list <- c("pre", "post", "pre_post") %>% lapply(function(period) {
    data <- is_oos[[period]]$regs
    if (type == "OLS") {
      fit <- lm(oos ~ is, data = data)
      # Output
      int <- unname(fit$coefficients[1])
      int_se <- summary(fit)$coefficients[1, "Std. Error"]
      slope <- unname(fit$coefficients[2])
      slope_se <- summary(fit)$coefficients[2, "Std. Error"]
      r2 <- summary(fit)$adj.r.squared # Adjusted R2
      # Label
      eq_lbl <- substitute( italic(y) == a + b %.% italic(x)*","~~italic(R)^2~"="~r2, list( a = formatC(int, digits = 2, format = "f"), b = formatC(slope, digits = 2, format = "f"), r2 = formatC(r2, digits = 2, format = "f")) )
    }
    if (type == "GLS") {
      # GLS: beta = (X' V^-1 X)^-1 X' V^-1 y with V = EB factor covariance.
      x <- cbind(rep(1, nrow(data)), data$is)
      y <- data$oos
      chars <- str_c(data$characteristic, "__us")
      gls_cov <- eb_est$us$factor_cov[chars, chars]
      gls_est <- solve(t(x) %*% solve(gls_cov) %*% x) %*% t(x) %*% solve(gls_cov) %*% y
      gls_res <- y - x %*% gls_est
      gls_e_var <- 1/(nrow(x)-ncol(x)) * t(gls_res) %*% solve(gls_cov) %*% (gls_res)
      gls_se <- sqrt(diag(drop(gls_e_var)* solve(t(x) %*% solve(gls_cov) %*% x)))
      # Output
      int <- gls_est[1, 1]
      int_se <- gls_se[1]
      slope <- gls_est[2, 1]
      slope_se <- gls_se[2]
      # Label (R2 doesn't really translate to GLS, because the mean prediction is no longer a good baseline)
      eq_lbl <- substitute( italic(y) == a + b %.% italic(x), list( a = formatC(int, digits = 2, format = "f"), b = formatC(slope, digits = 2, format = "f")) )
    }
    min_y <- min(data$is, data$oos)
    max_y <- max(data$is, data$oos)
    t_int <- formatC(round(int/int_se, 2), format='f', digits=2)
    t_is <- formatC(round(slope/slope_se, 2), format='f', digits=2)
    t_lbl <- paste0(" (", t_int, ") (", t_is, ")")
    data %>% ggplot(aes(is, oos)) + geom_point(colour = colours_theme[1]) + ylim(c(min_y, max_y)) + xlim(c(min_y, max_y)) + geom_hline(yintercept = 0, linetype = "solid") + geom_vline(xintercept = 0, linetype = "solid") + geom_abline(slope = 1, intercept = 0, linetype = "dotted") + ggtitle(label = eq_lbl, subtitle = t_lbl) + labs(x = "In-Sample", y = "Out-of-Sample")
  })
  plot_list
}

# Quadratic variant of plot_is_oos_post: regresses oos on is and is^2 and
# adds a loess smoother. Same periods, same OLS/GLS switch, same return type.
plot_is_oos_post_quad <- function(is_oos, type) {
  # type in c("OLS", "GLS")
  plot_list <- c("pre", "post", "pre_post") %>% lapply(function(period) {
    data <- is_oos[[period]]$regs
    if (type == "OLS") {
      fit <- lm(oos ~ is + I(is^2), data = data)
      # Output
      # NOTE(review): local `is` shadows the data column name inside this function.
      int <- unname(fit$coefficients[1])
      int_se <- summary(fit)$coefficients[1, "Std. Error"]
      is <- unname(fit$coefficients[2])
      is_se <- summary(fit)$coefficients[2, "Std. Error"]
      issq <- unname(fit$coefficients[3])
      issq_se <- summary(fit)$coefficients[3, "Std. Error"]
      r2 <- summary(fit)$adj.r.squared # Adjusted R2
      # Label
      eq_lbl <- substitute( italic(y) == a + b %.% italic(x)* ~ s ~ c %.% italic(x)^2*","~~italic(R)^2~"="~r2, list( a = formatC(int, digits = 2, format = "f"), b = formatC(is, digits = 2, format = "f"), s = ifelse(sign(issq)==1, "+", "-"), c = formatC(unname(abs(issq)), digits = 2, format = "f"), r2 = formatC(r2, digits = 2, format = "f")) )
    }
    if (type == "GLS") {
      x <- cbind(rep(1, nrow(data)), data$is, data$is^2)
      y <- data$oos
      chars <- str_c(data$characteristic, "__us")
      gls_cov <- eb_est$us$factor_cov[chars, chars]
      gls_est <- solve(t(x) %*% solve(gls_cov) %*% x) %*% t(x) %*% solve(gls_cov) %*% y
      gls_res <- y - x %*% gls_est
      gls_e_var <- 1/(nrow(x)-ncol(x)) * t(gls_res) %*% solve(gls_cov) %*% (gls_res)
      gls_se <- sqrt(diag(drop(gls_e_var)* solve(t(x) %*% solve(gls_cov) %*% x)))
      # Output
      int <- gls_est[1, 1]
      int_se <- gls_se[1]
      is <- gls_est[2, 1]
      is_se <- gls_se[2]
      issq <- gls_est[3, 1]
      issq_se <- gls_se[3]
      # Label
      eq_lbl <- substitute( italic(y) == a + b %.% italic(x)* ~ s ~ c %.% italic(x)^2, list( a = formatC(int, digits = 2, format = "f"), b = formatC(is, digits = 2, format = "f"), s = ifelse(sign(issq)==1, "+", "-"), c = formatC(unname(abs(issq)), digits = 2, format = "f")) )
    }
    min_y <- min(data$is, data$oos)
    max_y <- max(data$is, data$oos)
    t_int <- formatC(round(int/int_se, 2), format='f', digits=2)
    t_is <- formatC(round(is/is_se, 2), format='f', digits=2)
    t_issq <- formatC(round(issq/issq_se, 2), format='f', digits=2)
    t_lbl <- paste0(" (", t_int, ") (", t_is, ") (", t_issq, ")")
    data %>% ggplot(aes(is, oos)) + geom_point(colour = colours_theme[1]) + ylim(c(min_y, max_y)) + xlim(c(min_y, max_y)) + geom_hline(yintercept = 0, linetype = "solid") + geom_vline(xintercept = 0, linetype = "solid") + geom_abline(slope = 1, intercept = 0, linetype = "dotted") + geom_smooth(method = "loess", span = 1, formula = "y~x") + ggtitle(label = eq_lbl, subtitle = t_lbl) + labs(x = "In-Sample", y = "Out-of-Sample")
  })
  plot_list
}

# Effect Size Plot
# Average alpha per cluster: world, by region, and by size group.
# type: "ols" uses ols_est, "eb" uses post_mean. orig_sig: restrict to
# originally-significant factors. Returns a list of the three ggplots.
# Uses globals eb_est, eb_us_size, char_info.
plot_effects <- function(type, orig_sig, cluster_order) {
  # type in c("ols", "eb")
  if (orig_sig) {
    orig_sig_values <- T
  } else {
    orig_sig_values <- c(T, F)
  }
  if (type == "ols") { alpha_est = "ols_est" }
  if (type == "eb") { alpha_est = "post_mean" }
  (effect_world <- eb_est$world$factors %>% left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>% filter(orig_sig %in% orig_sig_values) %>% mutate(hcl_label = hcl_label %>% factor(levels = cluster_order)) %>% group_by(hcl_label) %>% summarise(mean_alpha = mean(get(alpha_est))) %>% ggplot(aes(reorder(hcl_label, mean_alpha), mean_alpha, fill = hcl_label)) + geom_col() + coord_flip() + labs(y = paste(str_to_upper(type), "Alpha Estimate (%)")) + theme( axis.title.y = element_blank(), legend.position = "none" ))
  (effect_regions <- eb_est$all$factors %>% left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>% filter(orig_sig %in% orig_sig_values) %>% mutate(hcl_label = hcl_label %>% factor(levels = cluster_order)) %>% group_by(region, hcl_label) %>% summarise(mean_alpha = mean(get(alpha_est))) %>% group_by(hcl_label) %>% mutate( sort_var = mean_alpha[region == "us"], region_pretty = case_when( region == "us" ~ "US", region == "developed" ~ "Developed", region == "emerging" ~ "Emerging" ), region_pretty = region_pretty %>% factor(levels = c("US", "Developed", "Emerging")) ) %>% ggplot(aes(reorder(hcl_label, sort_var), mean_alpha, fill = hcl_label)) + geom_col() + coord_flip() + scale_y_continuous(breaks = seq(-0.2, 1, 0.2)) + facet_wrap(~region_pretty, scales = "free_x") + labs(y = paste("Average", str_to_upper(type) ,"Alpha (%)")) + theme( axis.title.y = element_blank(), legend.position = "none" ))
  (effect_size <- eb_us_size %>% left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>% filter(orig_sig %in% orig_sig_values) %>% mutate(hcl_label = hcl_label %>% factor(levels = cluster_order)) %>% group_by(size_grp, hcl_label) %>% summarise(mean_alpha = mean(get(alpha_est))) %>% group_by(hcl_label) %>% mutate(sort_var = mean_alpha[size_grp == "Mega"]) %>% ggplot(aes(reorder(hcl_label, sort_var), mean_alpha, fill = hcl_label)) + geom_col() + coord_flip() + scale_y_continuous(breaks = seq(0, 1.5, 0.50)) + facet_wrap(~size_grp, nrow = 1, scales = "free_x") + labs(y = paste("Average", str_to_upper(type) ,"Alpha (%)")) + theme( axis.title.y = element_blank(), legend.position = "none" ))
  list(effect_world, effect_regions, effect_size)
}

# Replication Rate by Cluster
# Horizontal bar chart of the replication rate per cluster for one region's
# EB factor table; warns if the input mixes regions. orig_sig: restrict to
# originally-significant factors.
plot_repl_cluster <- function(eb_factors, orig_sig, cluster_order) {
  if (orig_sig) {
    factor_subset <- eb_factors %>% left_join(char_info %>% select(characteristic, significance), by = "characteristic") %>% filter(significance == T)
  } else {
    factor_subset <- copy(eb_factors)
  }
  if (uniqueN(factor_subset$region) != 1) { warning("!!!MULTIPLE REGIONS INCLUDED!!!") }
  # sort_var adds a tiny n-based tiebreaker so equal rates sort by cluster size.
  factor_subset %>% mutate(hcl_label = hcl_label %>% factor(levels = cluster_order)) %>% group_by(hcl_label) %>% summarise( n = n(), repl_rate = mean(p025 > 0), sort_var = repl_rate + n / 100000 ) %>% ggplot(aes(reorder(hcl_label, sort_var), repl_rate * 100, fill = hcl_label)) + geom_col() + labs(y = "Replication Rate (%)") + coord_flip() + theme( axis.title.y = element_blank(), legend.position = "none" )
}
# Plot Replication Rate as a Function of Tau - Benchmark against Harvey et al.
(2016) plot_harvey <- function(harvey_base_res, harvey_worst_res, tau_ws, act_rr) { mle_est_base <- harvey_base_res$sim[[1]] %>% lapply(function(x) x$mle) %>% bind_rows() %>% mutate(type = "baseline") mle_est_worst <- harvey_worst_res$sim[[1]] %>% lapply(function(x) x$mle) %>% bind_rows() %>% mutate(type = "worst_case") mle_est <- mle_est_base %>% bind_rows(mle_est_worst) mle_summary <- mle_est %>% group_by(type, coef) %>% summarise( n = n(), coef_mean = mean(mle) ) tc_harvey_base <- mle_summary %>% filter(type == "baseline" & coef == "tc") %>% pull(coef_mean) tc_harvey_worst <- mle_summary %>% filter(type == "worst_case" & coef == "tc") %>% pull(coef_mean) # Replication Rate under alternative Tau's m <- eb_est$us$factors %>% select(characteristic, ols_est, hcl_label) %>% mutate(cm = 1) %>% select(characteristic, hcl_label, cm) %>% spread(key = hcl_label, value = cm) %>% select(-characteristic) %>% as.matrix() m[is.na(m)] <- 0 mm <- m %*% t(m) alpha_hat <- eb_est$us$factors %>% pull(ols_est) alpha_0_vec <- rep(0, 153) sigma <- eb_est$us$sigma tc_act <- eb_est$us$mle %>% filter(estimate == "tau_c") %>% pull(ml_est) %>% round(2) search_grid <- expand.grid( tau_c = c(seq(0.15, 0.46, by = 0.01), tc_act, tc_harvey_base, tc_harvey_worst), tau_w = tau_ws ) repl_by_tau <- 1:nrow(search_grid) %>% lapply(function(i) { tw <- search_grid[i, "tau_w"] tc <- search_grid[i, "tau_c"] omega <- diag(153) * tw^2 + mm * tc^2 post_cov <- solve(solve(omega) + solve(sigma)) post_alpha <- post_cov %*% (solve(omega) %*% alpha_0_vec + solve(sigma) %*% alpha_hat) tibble(characteristic = eb_est$us$factors$characteristic, post_mean = drop(post_alpha), post_sd = sqrt(diag(post_cov))) %>% left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>% filter(orig_sig == T) %>% summarise(repl_rate = mean(post_mean - 1.96 * post_sd > 0)) %>% mutate(tau_c = tc, tau_w = tw) }) %>% bind_rows() # Set TW labels n_tw <- length(tau_ws) tau_w_names <- vector(mode = 
"expression", length = n_tw) for (i in 1:n_tw) { tau_w_names[i] <- c(bquote(bold(tau[w]) ~ "=" ~ .(unname(tau_ws[i])) ~ "%")) } names(tau_w_names) <- tau_ws # Generate Important Points tau_points <- repl_by_tau %>% filter(tau_c %in% c(tc_act, tc_harvey_base, tc_harvey_worst)) %>% distinct() %>% mutate( tau_w_title = tau_w %>% factor(labels = tau_w_names), type = case_when( tau_c == tc_act ~ "Estimated from Data", tau_c == tc_harvey_base ~ "Harvey, Liu, and Zhu (2016): Baseline", tau_c == tc_harvey_worst ~ "Harvey, Liu, and Zhu (2016): Conservative", TRUE ~ "Other" ), type = type %>% factor(levels = c("Harvey, Liu, and Zhu (2016): Conservative", "Harvey, Liu, and Zhu (2016): Baseline", "Estimated from Data")) ) print(tau_points) plot <- repl_by_tau %>% mutate( tau_w_title = tau_w %>% factor(labels = tau_w_names), type = case_when( tau_c == tc_act ~ "Estimated from Data", tau_c == tc_harvey_base ~ "Harvey, Liu, and Zhu (2016): Baseline", tau_c == tc_harvey_worst ~ "Harvey, Liu, and Zhu (2016): Conservative", TRUE ~ " " ), type = type %>% factor(levels = c("Estimated from Data", "Harvey, Liu, and Zhu (2016): Baseline", "Harvey et al. 
(2016): Conservative", " ")) ) %>% ggplot(aes(tau_c, repl_rate * 100)) + geom_point(data = tau_points, aes(colour = type, shape = type, stroke = 1), size = 3) + geom_line(alpha = 1, size = 0.6) + geom_hline(yintercept = act_rr*100, linetype = "dotted") + scale_x_continuous(breaks = seq(0.05, max(search_grid$tau_c), 0.05)) + ylim(c(0, 100)) + theme(legend.title = element_blank(), legend.position = "top") + labs(y = "Replication Rate (%)", x = bquote(bold(tau[c])~"(%)"), colour = expression(tau[c]), shape = expression(tau[c])) if (n_tw > 1) { plot <- plot + facet_wrap(~tau_w_title, labeller = label_parsed) } return(plot) } # Single Factor TPF -- # Plot TPF Factor: Cluster + Market plot_tpf_one_cluster <- function(data_wide, cluster_labels, s) { mkt_sr <- mean(data_wide$market)/sd(data_wide$market) one_cluster <- unique(cluster_labels$hcl_label) %>% lapply(function(c) { cl_chars <- cluster_labels %>% filter(characteristic %in% colnames(data_wide) & hcl_label == c) %>% pull(characteristic) # Cluster SR cl <- data_wide %>% select(market, all_of(cl_chars)) w <- cl %>% epo_tpf(s = s) sr_all <- cl %>% sr_func(w = w) # Average SR sr_single_avg <- cl_chars %>% sapply(function(char) { cl_sub <- data_wide %>% select(market, all_of(char)) w <- cl_sub %>% epo_tpf(s = opt_s) cl_sub %>% sr_func(w = w) }) %>% mean() tibble(hcl_label=c, sr_all=sr_all, sr_single_avg=sr_single_avg) }) %>% bind_rows() one_cluster %>% mutate(hcl_label = hcl_label %>% factor(levels = cluster_order)) %>% ggplot(aes(reorder(hcl_label, sr_all), sr_all, fill = hcl_label)) + geom_col() + coord_flip() + geom_hline(yintercept = mkt_sr, linetype = "dashed") + theme(legend.position = "none", axis.title.y = element_blank()) + labs(y = "Monthly Sharpe Ratio: Market + Cluster") } # Plot TPF Factor: Exclude one cluster plot_tpf_excl_cl <- function(data_wide, cluster_labels, s) { epo_w <- data_wide %>% epo_tpf(s = s) full_sr <- data_wide %>% sr_func(w = epo_w) excl_one <- c("Market", unique(cluster_labels$hcl_label)) 
%>% lapply(function(c) {
    cl_chars <- cluster_labels %>% filter(characteristic %in% colnames(data_wide) & hcl_label != c) %>% pull(characteristic)
    if (c != "Market") { cl_chars <- c(cl_chars, "market") }
    # All minus Cluster SR
    data <- data_wide %>% select(all_of(cl_chars))
    w <- data %>% epo_tpf(s = s)
    sr <- data %>% sr_func(w = w)
    tibble(hcl_label=c, sr=sr)
  }) %>% bind_rows()
  excl_one %>%
    mutate(hcl_label = hcl_label %>% factor(levels = c(cluster_order, "Market"))) %>%
    ggplot(aes(reorder(hcl_label, -sr), (full_sr-sr) / full_sr, fill = hcl_label)) +
    geom_col() +
    coord_flip() +
    theme(legend.position = "none", axis.title.y = element_blank()) +
    labs(y = "Percentage Drop in Monthly SR from Excluding Cluster")
}
# Plot TPF Factor: Single Factor Importance
plot_tpf_factor_imp <- function(data_wide, cluster_labels, s) {
  # For each factor (and the market), compute the drop in the tangency
  # portfolio SR when that single factor is excluded, and plot the drops,
  # faceted by cluster. `s` is the shrinkage forwarded to epo_tpf().
  epo_w <- data_wide %>% epo_tpf(s = s)
  full_sr <- data_wide %>% sr_func(w = epo_w)
  each_factor <- colnames(data_wide) %>% lapply(function(c) {
    # SR excluding char
    data <- data_wide %>% select(-all_of(c))
    # BUG FIX: previously used the global `opt_s` instead of the argument `s`.
    w <- data %>% epo_tpf(s = s)
    sr <- data %>% sr_func(w = w)
    # Output
    tibble(characteristic = c, sr = sr)
  }) %>% bind_rows()
  # NOTE(review): this first (top-10) plot is computed but its value is
  # discarded -- only the faceted plot below is returned. Kept for parity.
  each_factor %>%
    left_join(cluster_labels, by = "characteristic") %>%
    mutate(
      hcl_label = if_else(characteristic == "market", "Market", hcl_label),
      hcl_label = hcl_label %>% factor(levels = c(cluster_order, "Market")),
      drop = full_sr - sr,
      drop_prop = drop/full_sr
    ) %>%
    arrange(drop_prop) %>%
    tail(10) %>%
    ggplot(aes(reorder(characteristic, drop_prop), drop_prop*100, fill = hcl_label)) +
    geom_col() +
    coord_flip() +
    theme(axis.title.y = element_blank()) +
    labs(y = "Drop in TPF SR (% of full)", fill = "Cluster")
  each_factor %>%
    left_join(cluster_labels, by = "characteristic") %>%
    mutate(
      hcl_label = if_else(characteristic == "market", "Market", hcl_label),
      hcl_label = hcl_label %>% factor(levels = c(cluster_order, "Market")),
      drop = full_sr - sr,
      drop_prop = drop/full_sr,
      rank = frank(-drop_prop)
    ) %>%
    group_by(hcl_label) %>%
    filter(characteristic != "market") %>%
    # filter(drop_prop == max(drop_prop)) %>%
    ggplot(aes(reorder(characteristic, drop_prop), drop_prop*100, fill = hcl_label)) +
    geom_col() +
    coord_flip() +
    theme(axis.title.y = element_blank(), legend.position = "none") +
    labs(y = "Drop in TPF SR (% of full)", fill = "Cluster")+
    facet_wrap(~hcl_label, scales = "free_y")
}
# Plot TPF Factor: Single Factor Importance within Cluster
plot_tpf_factor_imp_cluster <- function(data_wide, cluster_labels, s) {
  # Within each cluster: drop in the cluster-level tangency portfolio SR
  # (market + cluster factors) from excluding one factor at a time.
  within_cluster <- unique(cluster_labels$hcl_label) %>% lapply(function(c) {
    cl_chars <- cluster_labels %>% filter(characteristic %in% colnames(data_wide) & hcl_label == c) %>% pull(characteristic)
    # Full Cluster tpf
    data <- data_wide %>% select(market, all_of(cl_chars))
    # BUG FIX: previously used the global `opt_s` instead of the argument `s`.
    w_all <- data %>% epo_tpf(s = s)
    sr_all <- data %>% sr_func(w = w_all)
    # Individual
    sr_chars <- cl_chars %>% lapply(function(char) {
      sub <- data %>% select(-all_of(char))
      w_sub <- sub %>% epo_tpf(s = s)
      sr_sub <- sub %>% sr_func(w = w_sub)
      tibble(excl_char = char, sr = sr_sub)
    }) %>% bind_rows()
    sr_chars %>% mutate(hcl_label = c, sr_all = sr_all)
  }) %>% bind_rows()
  within_cluster %>%
    mutate(
      drop = sr_all - sr,
      drop_prop = drop/sr_all,
      hcl_label = hcl_label %>% factor(levels = cluster_order)
    ) %>%
    group_by(hcl_label) %>%
    # filter(drop == max(drop)) %>%
    ggplot(aes(reorder(excl_char, drop_prop), drop_prop*100, fill = hcl_label)) +
    geom_col() +
    coord_flip() +
    theme(axis.title.y = element_blank(), legend.position = "none") +
    labs(y = "Drop in cluster TPF SR (% of full cluster)", fill = "Cluster") +
    facet_wrap(~hcl_label, scales = "free")
}
# Plot TPF Factor: The Evolution of the TPF
plot_tpf_evolution <- function(data, data_wide, char_info, orig_sig_values, s) {
  mkt_sr <- mean(data_wide$market) / sd(data_wide$market)
  years <- data %>%
    filter(year(eom) > min(char_info$sample_end)) %>%
    mutate(year = year(eom)) %>%
    pull(year) %>%
    unique()
  sr_over_time <- years %>% lapply(function(y) {
    discovered_chars <- char_info %>%
filter(sample_end <= y & significance %in% orig_sig_values) %>% pull(characteristic) sub <- data_wide %>% select(all_of(discovered_chars), "market") w <- sub %>% epo_tpf(s = s) # tibble(w = w, char = colnames(sub)) %>% left_join(char_info %>% select("char"=characteristic, sample_end), by = "char") %>% arrange(-w) # 2002: seasonality coincides with noa_at that also gets a large weight tibble(year = y, tpf_sr = sr_func(sub, w), n = ncol(sub)) }) %>% bind_rows() sr_over_time <- sr_over_time %>% bind_rows(tibble(year = min(years)-1, tpf_sr = mkt_sr, n = 0)) (sr_plot <- sr_over_time %>% ggplot(aes(year, tpf_sr)) + geom_point() + geom_line() + ylim(c(0, NA)) + # geom_hline(yintercept = full_sr) + annotate("text", x = 1971, y = 0, label = "Market", colour='black') + geom_segment(aes(x = 1971, y = 0.02, xend = 1971, yend = 0.1), size=0.1,arrow = arrow(length = unit(0.2, "cm"))) + labs(y = "Ex-Post Tangency SR", x = "Year") + theme(axis.title.x = element_blank()) + annotate("text", x = 1972, y = 0.34, label = "Beta", colour='black') + geom_segment(aes(x = 1972, y = 0.31, xend = 1972, yend = 0.23), size=0.1,arrow = arrow(length = unit(0.2, "cm"))) + annotate("text", x = 1979, y = 0.45, label = "Earning-to-Price", colour='black') + geom_segment(aes(x = 1979, y = 0.42, xend = 1979, yend = 0.30), size=0.1,arrow = arrow(length = unit(0.2, "cm"))) + annotate("text", x = 1983, y = 0.08, label = "Earnings Momentum", colour='black') + geom_segment(aes(x = 1981, y = 0.10, xend = 1981, yend = 0.25), size=0.1,arrow = arrow(length = unit(0.2, "cm"))) + annotate("text", x = 1989, y = 0.25, label = "Price Momentum", colour='black') + geom_segment(aes(x = 1989, y = 0.27, xend = 1989, yend = 0.39), size=0.1,arrow = arrow(length = unit(0.2, "cm"))) + annotate("text", x = 1991, y = 0.73, label = "Operating Accruals", colour='black') + geom_segment(aes(x = 1991, y = 0.69, xend = 1991, yend = 0.6), size=0.1,arrow = arrow(length = unit(0.2, "cm"))) + annotate("text", x = 2002, y = 0.9, label = 
"Seasonality", colour='black') +
      geom_segment(aes(x = 2002, y = 0.87, xend = 2002, yend = 0.78), size=0.1,arrow = arrow(length = unit(0.2, "cm"))))
  n_plot <- sr_over_time %>%
    ggplot(aes(year, n)) +
    geom_point() +
    geom_line() +
    labs(y = "Factor Discovered", x = "Year of Discovery")
  list("plot"=cowplot::plot_grid(sr_plot, n_plot, ncol = 1, rel_heights = c(0.7, 0.3)), "data"=sr_over_time)
}
# Plot performance over time
plot_ts <- function(data, oos, alphas, scale, orig_sig, start = as.Date("1986-01-01")) {
  # Cumulative performance of the average factor in the US vs. World ex. US.
  # - oos: keep only observations after each factor's original sample period
  # - alphas: plot CAPM alphas instead of excess returns
  # - scale: scale returns to 10% annualized volatility
  # - orig_sig: keep only factors significant in the original study
  # Also prints an xtable with SR/t-stats for the figure caption.
  # Relies on globals `cluster_labels` and `char_info` (as elsewhere in this file).
  # BUG FIX: `:=` modifies a data.table by reference, so the caller's table
  # was mutated (its `region` column overwritten); work on a copy instead.
  data <- copy(data)
  data[, region := case_when(
    region == "us" ~ "US",
    region == "world_ex_us" ~ "World ex. US"
  )]
  data <- data[eom >= start]
  data <- cluster_labels[data, on = "characteristic"]
  if (oos) {
    data <- setDT(char_info)[, .(characteristic, sample_end)][data, on = .(characteristic)]
    data <- data[year(eom) > sample_end][, sample_end := NULL]
  }
  if (orig_sig) {
    data <- setDT(char_info)[, .(characteristic, significance)][data, on = .(characteristic)]
    data <- data[significance==T][, significance := NULL]
  }
  y_axis <- paste0("Cumulative ", if_else(alphas==T, "Alpha ", "Excess Return "), if_else(oos==T, "(OOS)", "(IS)"))
  agg <- data[, .(ret = mean(ret), mkt = mean(mkt_vw_exc)), by = .(region, eom)]
  if (alphas) {
    agg[, ret := ret - cov(ret,mkt)/var(mkt)*mkt, by = .(region)]
  }
  if (scale) {
    agg[, ret := ret / (sd(ret)*sqrt(12)/0.1), by = .(region)]
  }
  agg %>% setorder(region, eom)
  agg[, cumret_app := cumsum(ret), by = region]
  plot <- agg %>%
    ggplot(aes(eom, cumret_app, colour = region)) +
    geom_line() +
    labs(y = y_axis) +
    theme(
      legend.position = c(0.85, 0.35),
      legend.title = element_blank(),
      axis.title.x = element_blank()
    )
  # Table
  tbl <- agg %>%
    group_by(region) %>%
    summarise(
      n = n(),
      meanret = mean(ret),
      vol = sd(ret),
      ret_vol = meanret/vol*sqrt(12),
      t = meanret/(vol/sqrt(n))
    ) %>%
    mutate(meanret = meanret*12)
  tbl %>%
    select(region, ret_vol, t) %>%
    pivot_longer(c(ret_vol, t)) %>%
    mutate(
      value = formatC(value, digits=2, format = "f"),
      value = if_else(name == "t", paste0("(", value, ")"), value)
    ) %>%
    mutate(
      region = if_else(name == "t", "", region)
    ) %>%
    select(-name) %>%
    rename("Region"=region, "Full sample"=value) %>%
    xtable(align = "llc") %>%
    print(include.rownames = F)
  # Output
  return(plot)
}
# Plot OOS performance of significant factors
plot_sig_oos <- function(sig_oos_pfs, sig_type, cutoff_2012, first_date, leg_pos) {
  full <- sig_oos_pfs %>%
    filter(eom >= first_date) %>%
    group_by(region, type, significant) %>%
    mutate(
      a = ret - cov(mkt,ret)/var(mkt)*mkt
    ) %>%
    summarise(
      n = n(),
      meanret = mean(ret),
      sd = sd(ret),
      sr = meanret/sd * sqrt(12),
      alpha = mean(a),
      resvol = sd(a),
      ir = alpha/resvol*sqrt(12),
      t_alpha = alpha/(resvol/sqrt(n))
    ) %>%
    mutate(alpha = alpha*12) %>%
    filter(region %in% c("us", "world_ex_us") & type == sig_type) %>%
    setDT()
  post_harvey <- sig_oos_pfs %>%
    filter(eom >= first_date) %>%
    group_by(region, type, significant) %>%
    mutate(
      a = ret - cov(mkt,ret)/var(mkt)*mkt
    ) %>%
    filter(eom >= as.Date("2013-01-01")) %>%
    summarise(
      n = n(),
      meanret = mean(ret),
      sd = sd(ret),
      sr = meanret/sd * sqrt(12),
      alpha = mean(a),
      resvol = sd(a),
      ir = alpha/resvol*sqrt(12),
      t_alpha = alpha/(resvol/sqrt(n))
    ) %>%
    mutate(alpha = alpha*12) %>%
    filter(region %in% c("us", "world_ex_us") & type == sig_type) %>%
    setDT()
  cumret <- sig_oos_pfs %>%
    filter(eom >= first_date) %>%
    group_by(region, type, significant) %>%
    arrange(region, type, significant, eom) %>%
    filter(type == sig_type & region %in% c("us", "world_ex_us")) %>%
    mutate(
      alpha = ret - cov(mkt,ret)/var(mkt)*mkt,
      alpha = alpha / (sd(alpha)*sqrt(12)/0.1),
      cum_alpha = cumsum(alpha),
      region_pretty = case_when(
        region == "us" ~ "U.S.",
        region == "world_ex_us" ~ "World ex. U.S."
) ) # Figure sig_oos <- cumret %>% ggplot(aes(eom, cum_alpha, colour = region_pretty, linetype = region_pretty)) + geom_line() + labs(y = "Cumulative Alpha") + theme( axis.title.x = element_blank(), legend.position = leg_pos, legend.title = element_blank() ) # Table for caption tbl <- rbind( full %>% select(region, ir, t_alpha) %>% mutate(sample = "Full sample"), post_harvey %>% select(region, ir, t_alpha) %>% mutate(sample = "Post Harvey et al") ) %>% pivot_longer(c(ir, t_alpha)) %>% mutate( value = formatC(value, digits=2, format = "f"), value = if_else(name == "t_alpha", paste0("(", value, ")"), value) ) %>% pivot_wider(names_from = sample, values_from = value) %>% mutate( region = case_when( region=="us" ~ "IR: US", region == "world_ex_us" ~ "IR: World ex. US" ), region = if_else(name == "t_alpha", "", region) ) %>% select(-name) %>% rename("Region"=region) if (cutoff_2012) { sig_oos <- sig_oos + geom_vline(xintercept = as.Date("2012-12-31"), linetype = "dotted", alpha = 1) tbl %>% xtable(align = "llcc") %>% print(include.rownames = F) } else { tbl %>% select(-`Post Harvey et al`) %>% xtable(align = "llc") %>% print(include.rownames = F) } # Output return(sig_oos) } # EB Posterior checks eb_plots <- function(eb, plot = "shrinkage") { if (plot == "cluster_distribution") { a <- eb$mle %>% filter(estimate == "alpha") %>% pull(ml_est) tb <- eb$mle %>% filter(estimate == "tau_bar") %>% pull(ml_est) op <- data.frame(x=c(a-3*tb, a+3*tb)) %>% # data.frame(x=c(-1.2, 1.2)) %>% ggplot(aes(x)) + stat_function(fun=function(x) dnorm(x = x, mean = a, sd = tb)) + labs(x = "Alpha", y = "Density", title = "Population Cluster Distribution ~ N(a0, tau_bar)") } if (plot == "factor_distribution") { a <- eb$mle %>% filter(estimate == "alpha") %>% pull(ml_est) tb <- eb$mle %>% filter(estimate == "tau_bar") %>% pull(ml_est) tt <- eb$mle %>% filter(estimate == "tau_tilde") %>% pull(ml_est) factor_sd <- sqrt(tb^2 + tt^2) op <- data.frame(x=c(a-3*factor_sd, a+3*factor_sd)) %>% # 
data.frame(x=c(-1.2, 1.2)) %>% ggplot(aes(x)) + stat_function(fun=function(x) dnorm(x = x, mean = a, sd = factor_sd)) + labs(x = "Alpha", y = "Density", title = "Population Factor Distribution ~ N(a0, tau_bar + tau_tilde)") } if (plot == "factor") { op <- eb$factors %>% ggplot(aes(reorder(characteristic, post_mean), post_mean)) + geom_point() + geom_errorbar(aes(ymin = p025, ymax = p975)) + facet_wrap(~region) + theme( axis.title.x = element_blank(), axis.text.x = element_text(size = 5, angle = 90, vjust = 0, hjust = 1), text = element_text(size = 10) ) } if (plot == "cluster") { op <- eb$clusters %>% ggplot(aes(reorder(hcl_label, post_mean), post_mean)) + geom_point() + geom_errorbar(aes(ymin = post_mean - 1.96 * post_sd, ymax = post_mean + 1.96 * post_sd)) + geom_hline(yintercept = 0, linetype = "dotted") + labs(y = "Posterior Distribution of Cluster Alpha") + theme( axis.title.x = element_blank(), axis.text.x = element_text(size = 8) ) } if (plot == "signal") { op <- eb$signal %>% ggplot(aes(characteristic, post_mean)) + geom_point() + geom_errorbar(aes(ymin = post_mean - 1.96 * post_sd, ymax = post_mean + 1.96 * post_sd)) } if (plot == "factor_shrinkage") { op <- eb$factors %>% select(characteristic, region, "eb" = post_mean, "ols" = ols_est) %>% gather(eb, ols, key = "type", value = "alpha") %>% group_by(characteristic) %>% mutate(sort_var = alpha[region == "us" & type == "ols"]) %>% ggplot(aes(reorder(characteristic, sort_var), alpha, colour = region)) + geom_point() + facet_wrap(~type, ncol = 1) + theme( axis.title.x = element_blank(), axis.text.x = element_text(size = 5, angle = 90, vjust = 0, hjust = 1) ) } if (plot == "se") { op <- eb_act$factors %>% mutate(ols_p025 = ols_est - 1.96 * ols_se) %>% mutate(se_diff = (ols_se-post_sd) / ols_se) %>% ggplot(aes(x = region, y = se_diff*100, colour = region)) + geom_boxplot() + expand_limits(y = 0) + theme( axis.title.x = element_blank(), legend.position = "none" ) + labs(y = "(SE_ols - SE_eb) / SE_ols * 100") } 
if (plot == "repl") { repl_table <- eb$factors %>% group_by(region) %>% summarise( repl_eb = mean(p025 > 0), repl_ols = mean(ols_est - 1.96*ols_se > 0) ) print(repl_table) op <- eb$factors %>% group_by(region, hcl_label) %>% summarise( mean_alpha = mean(post_mean), rep_rate = mean(p025>0) ) %>% group_by(hcl_label) %>% mutate(sort_var = rep_rate[region == "US"]) %>% ggplot(aes(reorder(hcl_label, sort_var), rep_rate)) + geom_col() + facet_wrap(~region, ncol = 1) + theme( axis.title.x = element_blank(), axis.text.x = element_text(size = 7) ) } if (plot == "significance") { op <- bind_rows( eb$factors %>% select(char_reg, characteristic, hcl_label, region, "alpha" = post_mean, "se" = post_sd) %>% mutate(type = "hlm"), eb$factors %>% select(char_reg, characteristic, hcl_label, region, "alpha" = ols_est, "se" = ols_se) %>% mutate(type = "ols") ) %>% mutate(significant = alpha - 1.96 * se > 0) %>% group_by(hcl_label) %>% mutate( sort_var = median(alpha[type == "ols" & region == "US"]), type = case_when( type == "hlm" ~ "Empirical Bayes", type == "ols" ~ "OLS" ) ) %>% ggplot(aes(reorder(characteristic, sort_var), alpha, colour = significant)) + geom_point() + geom_errorbar(aes(ymin = alpha - 1.96*se, ymax = alpha + 1.96*se)) + geom_hline(yintercept = 0, colour = "black") + facet_wrap(region~type, ncol = 2) + labs(y = "Alpha", colour = "Significant") + theme( axis.title.x = element_blank(), axis.text.x = element_text(size = 5, angle = 90, vjust = 0, hjust = 1) ) } if (plot == "shrinkage") { op <- eb$factors %>% select(region, characteristic, "hlm_alpha_mean" = post_mean, "ols_alpha" = ols_est) %>% gather(-characteristic, -region, key = "type", value = "alpha") %>% group_by(characteristic, region) %>% mutate(sort_var = sum(alpha * (type == "ols_alpha"))) %>% group_by(region) %>% mutate( rank = frank(sort_var, ties.method = "max") / 2, type = case_when( type == "hlm_alpha_mean" ~ "Empricial Bayes Posterior Mean", type == "ols_alpha" ~ "OLS Estimate" ) ) %>% ggplot(aes(rank, 
alpha, shape = type, colour = type, group = type)) + geom_smooth(method = "lm", se = F, formula = "y ~ x") + geom_point() + facet_wrap(~region) + theme( axis.text.x = element_text(size = 7), text = element_text(size = 10) ) + labs(x = "Rank OLS Alpha", y = "Alpha", colour = "Type", shape = "Type") } if (plot == "comparison") { op <- bind_rows( eb$factors %>% select(char_reg, characteristic, hcl_label, region, "alpha" = post_mean, "se" = post_sd) %>% mutate(type = "hlm"), eb$factors %>% select(char_reg, characteristic, hcl_label, region, "alpha" = ols_est, "se" = ols_se) %>% mutate(type = "ols") ) %>% group_by(hcl_label) %>% mutate( sort_var = median(alpha[type == "ols" & region == "US"]), type = case_when( type == "hlm" ~ "Empirical Bayes", type == "ols" ~ "OLS" ) ) %>% ggplot(aes(reorder(characteristic, sort_var), alpha, colour = hcl_label)) + geom_point() + geom_errorbar(aes(ymin = alpha - 1.96*se, ymax = alpha + 1.96*se)) + geom_hline(yintercept = 0, colour = "black") + facet_wrap(region~type, ncol = 2) + labs(y = "Alpha", colour = "Cluster") + theme( axis.title.x = element_blank(), axis.text.x = element_text(size = 5, angle = 90, vjust = 0, hjust = 1) ) } if (plot == "cluster_density") { op <- eb$clusters %>% group_by(hcl_label) %>% nest() %>% mutate(randoms = data%>% map(~rnorm(1000, mean = .x$post_mean, sd = .x$post_sd))) %>% unnest(randoms) %>% ggplot(aes(x = randoms, fill = hcl_label)) + geom_density(alpha = 0.5) # + facet_wrap(~hcl_label) } print(op) } ================================================ FILE: Analysis/1 - Prepare Data.R ================================================ # Prepare Support Data --------------------------------- # Market Returns market_returns <- fread(paste0(data_path, "/market_returns.csv"), colClasses = c("eom"="character")) market_returns[, eom := eom %>% as.Date(format = "%Y-%m-%d")] market_returns <- market_returns[, .(excntry, eom, mkt_vw_exc, stocks, me_lag1)] market_returns <- market_returns[ eom >= settings$start_date & 
eom <= settings$end_date & !(excntry %in% settings$country_excl) & !(excntry == "PER" & eom == as.Date("1992-01-31") & mkt_vw_exc >= 8900) & # Huge outlier !(excntry == "VEN" & eom == as.Date("2018-02-28") & mkt_vw_exc < -1)] # Something is clearly wrong # Labels char_info <- readxl::read_xlsx("Factor Details.xlsx", sheet = "details", range = "A1:N300") %>% select("characteristic"=abr_jkp, direction, significance, date_range = `in-sample period`, "hxz_group"=group) %>% filter(!is.na(characteristic)) %>% mutate( direction = direction %>% as.integer, sample_start = date_range %>% str_extract("^\\d+") %>% as.integer(), sample_end = date_range %>% str_extract("\\d+$") %>% as.integer() ) base_chars <- char_info$characteristic # Country Classification country_classification <- readxl::read_xlsx("Country Classification.xlsx", sheet = "countries", range = "A1:C200") %>% select(excntry, msci_development, region) %>% filter(!is.na(excntry)) %>% setDT() # Regions region_info <- tibble( name = c("us", "developed", "emerging", "frontier", "world", "world_ex_us"), country_codes = list( "USA", country_classification[msci_development == "developed" & excntry != "USA"]$excntry, country_classification[msci_development == "emerging"]$excntry, country_classification[msci_development == "frontier"]$excntry, country_classification$excntry, country_classification[excntry != "USA"]$excntry ), countries_min = c(1, rep(settings$countries_min, 3), 1, 3) ) # Prepare Data -------------------------------------------------------- # HML ---------------------- hml <- fread(paste0(data_path, "/hml.csv"), colClasses = c("eom"="character")) hml[, eom := eom %>% as.Date(format = "%Y-%m-%d")] # Choose weighting hml[excntry == "USA", ret := case_when( settings$weighting$us == "vw" ~ ret_vw, settings$weighting$us == "ew" ~ ret_ew, settings$weighting$us == "vw_cap" ~ ret_vw_cap )] hml[excntry != "USA", ret := case_when( settings$weighting$global_ex_us == "vw" ~ ret_vw, settings$weighting$global_ex_us == 
"ew" ~ ret_ew, settings$weighting$global_ex_us == "vw_cap" ~ ret_vw_cap )] # Screens hml <- hml %>% filter( characteristic %in% base_chars, eom >= settings$start_date & eom <= settings$end_date, !is.na(ret) & n_stocks_min >= settings$n_stocks_min, # Min Stocks is the big one, it removes close to 50% of the obs. with n_stocks_min>=10. Perhaps it's too stringent. !(excntry %in% settings$country_excl) ) %>% select(-signal, -n_stocks_min) # Set direction as original study hml <- hml %>% left_join(char_info %>% select(characteristic, direction), by = "characteristic") %>% mutate(ret = ret * direction) %>% select(-ret_vw, -ret_ew, -ret_vw_cap) # Ensure no Duplicates if(hml[, .N, by = .(characteristic, excntry, eom)][, max(N)] > 1) { warning("THE DATA HAS DUPLICATES") } # Regional Portfolios ------------------------------------------------ regional_data <- function(data, countries, weighting, countries_min, months_min, size_grps = F) { # Determine Country Weights weights <- market_returns[, .(excntry, eom, mkt_vw_exc, "country_weight" = case_when( weighting == "market_cap" ~ me_lag1, weighting == "stocks" ~ as.double(stocks), weighting == "ew" ~ 1) )] # Portfolio Return pf <- data[excntry %in% countries] pf <- weights[pf, on = .(excntry, eom)] if (size_grps) { pf <- pf[, .( n = .N, ret = sum(ret*country_weight) / sum(country_weight), mkt_vw_exc = sum(mkt_vw_exc * country_weight) / sum(country_weight) ), by = .(characteristic, size_grp, eom)] } else { pf <- pf[, .( n = .N, ret = sum(ret*country_weight) / sum(country_weight), mkt_vw_exc = sum(mkt_vw_exc * country_weight) / sum(country_weight) ), by = .(characteristic, eom)] } # Minimum Requirement: Countries pf <- pf[n >= countries_min] # Minimum Requirement: Months pf[, months := .N, by = .(characteristic)] pf <- pf[months >= months_min] return(pf) } regional_pfs <- 1:nrow(region_info) %>% lapply(function(i) { info <- region_info[i, ] reg_pf <- hml %>% regional_data(countries = unlist(info$country_codes), weighting = 
settings$country_weighting, countries_min = info$countries_min, months_min = settings$months_min)
  reg_pf %>% mutate(region = info$name)
}) %>% bind_rows()
# Characteristic Managed Portfolios ----------------------
cmp <- fread(paste0(data_path, "/cmp.csv"), colClasses = c("eom"="character"))
cmp[, eom := eom %>% as.Date(format="%Y-%m-%d")]
# Screens
cmp <- cmp %>%
  rename(ret = ret_weighted) %>%
  filter(
    characteristic %in% base_chars,
    eom >= settings$start_date & eom <= settings$end_date,
    !is.na(ret) & signal_weighted != 0 & n_stocks >= settings$n_stocks_min * 2,
    !(excntry %in% settings$country_excl)
  ) %>%
  select(-signal_weighted, -n_stocks)
# Determine Direction
cmp <- cmp %>%
  left_join(char_info %>% select(characteristic, direction), by = "characteristic") %>%
  mutate(ret = ret * direction)
# Ensure no Duplicates
if(cmp[, .N, by = .(characteristic, excntry, size_grp, eom)][, max(N)] > 1) {
  warning("THE DATA HAS DUPLICATES")
}
# Regional Portfolios
region_info_cmp <- region_info %>% filter(name == "us")
regional_pfs_cmp <- 1:nrow(region_info_cmp) %>% lapply(function(i) {
  # BUG FIX: previously indexed `region_info[i, ]` instead of
  # `region_info_cmp[i, ]` -- correct only by coincidence because "us"
  # happens to be the first row of region_info.
  info <- region_info_cmp[i, ]
  reg_pf <- cmp %>% regional_data(countries = unlist(info$country_codes), weighting = settings$country_weighting, countries_min = info$countries_min, months_min = settings$months_min, size_grps = T)
  reg_pf %>% mutate(region = info$name)
}) %>% bind_rows()
# Regional Market Returns ---
regional_mkt_ret <- 1:nrow(region_info) %>% lapply(function(i) {
  info <- region_info[i, ]
  mkt <- market_returns[excntry %in% unlist(info$country_codes), .(n = .N, market = sum(mkt_vw_exc * me_lag1) / sum(me_lag1)), by = eom]
  mkt <- mkt[n >= info$countries_min][, n:= NULL]
  mkt %>% mutate(region = info$name)
}) %>% bind_rows()
print(paste("Total Characteristics:", uniqueN(regional_pfs$characteristic)))
================================================
FILE: Analysis/2 - Determine Clusters.R
================================================
# Hierachical Clustering
----------------------------------------------
factor_hcl <- function(cor_mat, linkage = "ward.D", k, direction_bars = T) {
  # Hierarchical clustering of factors from a correlation matrix.
  # - cor_mat: factor-by-factor correlation matrix (named rows/cols)
  # - linkage: hclust() agglomeration method
  # - k: number of clusters to cut the tree into
  # - direction_bars: add black/white bars showing each factor's direction
  # Plots the dendrogram as a side effect; returns the correlation matrix,
  # cluster labels, and dendrogram (plus bar colours when requested).
  # Relies on globals `colours_theme` and `char_info` (as elsewhere in this script).
  dist_mat <- as.dist((1-cor_mat))
  # dist_mat <- as.dist(sqrt((1-cor_mat)*2)) # With ward.D2 gives the same clusters
  hcl <- dist_mat %>% hclust(method=linkage)
  # TYPO FIX in diagnostic message: "Dendogram" -> "Dendrogram"
  print(str_c("Cophenetic Correlation between Dendrogram and Distance Matrix = ", format(cor(cophenetic(hcl), dist_mat), digits = 2, nsmall = 2)))
  hcl_labels <- hcl %>%
    cutree(k = k) %>%
    as_tibble(rownames = "characteristic") %>%
    setDT() %>%
    setnames(c("characteristic", "hcl"))
  # Recycle the theme palette so there is one colour per cluster
  hcl_col <- rep(colours_theme[c(1, 2, 3, 4, 5, 7, 9, 11)], ceiling(k/8))[1:k]
  dend <- hcl %>%
    as.dendrogram() %>%
    dendextend::set("labels_col", value = hcl_col, k=k) %>%
    dendextend::set("branches_k_color", value = hcl_col, k=k) %>%
    dendextend::set("labels_cex", value = 0.5) %>%
    dendextend::set("branches_lty", 1) %>%
    dendextend::set("branches_lwd", 0.2)
  dend %>% plot(horiz=T)
  return_list <- list(
    "cor" = cor_mat,
    "labels" = hcl_labels,
    "dend" = dend
  )
  if (direction_bars) {
    bar_colours <- tibble("characteristic" = colnames(cor_mat)) %>%
      left_join(char_info %>% select(characteristic, direction), by = "characteristic") %>%
      mutate(col_dir = if_else(direction == 1, "black", "white"))
    colored_bars(colors = bar_colours %>% select(col_dir), dend = dend, rowLabels = c("Long High"), y_shift = 3, horiz = T)
    return_list$bar_colours <- bar_colours
  }
  return(return_list)
}
hcl_input <- function(data, ret_type = "alpha", ...) { # ret_type %in% c("raw", "alpha")
  data <- copy(data) # Avoid modifying in place
  if (ret_type == "raw") {
    data[, ret_hcl := ret]
  }
  if (ret_type == "alpha") {
    data[, ret_hcl := ret - mkt_vw_exc * cov(ret, mkt_vw_exc)/var(mkt_vw_exc), by = characteristic]
  }
  data %>%
    select(characteristic, eom, ret_hcl) %>%
    spread(key = characteristic, value = ret_hcl) %>%
    select(-eom) %>%
    cor(...)
} # US Clusters ----------- clusters <- regional_pfs %>% filter(region == settings$hcl$region & year(eom) >= settings$hcl$start_year) %>% hcl_input(ret_type = settings$hcl$ret_type, method = settings$hcl$cor_method, use = "pairwise.complete.obs") %>% factor_hcl(linkage = settings$hcl$linkage, k = settings$hcl$k, direction_bars = T) # Cluster Labels if (settings$weighting$us == "vw_cap" & settings$hcl$k == 13 & settings$hcl$region == "us" & settings$hcl$start_year == 1975) { clusters$labels <- clusters$labels %>% mutate( hcl_label = case_when( hcl == 1 ~ "Low Leverage", hcl == 2 ~ "Investment", hcl == 3 ~ "Size", hcl == 4 ~ "Value", hcl == 5 ~ "Quality", hcl == 6 ~ "Low Risk", hcl == 7 ~ "Debt Issuance", hcl == 8 ~ "Seasonality", hcl == 9 ~ "Accruals", hcl == 10 ~ "Profitability", hcl == 11 ~ "Profit Growth", hcl == 12 ~ "Short-Term Reversal", hcl == 13 ~ "Momentum", TRUE ~ as.character(hcl)) ) } else { clusters$labels <- clusters$labels %>% mutate(hcl_label = hcl) } # Output cluster_labels <- clusters$labels %>% select(-hcl) ================================================ FILE: Analysis/3 - Analysis.R ================================================ # Empirical Bayes Estimation -------------------------- # search_list: c(regions, type, layers, size_grp) search_list <- list( "us" = list("us", "hml", 2), "developed" = list("developed", "hml", 2), "emerging" = list("emerging", "hml", 2), "all" = list(c("us", "developed", "emerging"), "hml", 3), "world" = list("world", "hml", 2), "world_ex_us" = list("world_ex_us", "hml", 2), "us_mega" = list("us", "cmp", 2, "mega"), "us_large" = list("us", "cmp", 2, "large"), "us_small" = list("us", "cmp", 2, "small"), "us_micro" = list("us", "cmp", 2, "micro"), "us_nano" = list("us", "cmp", 2, "nano") ) eb_est <- search_list %>% sapply(simplify = F, USE.NAMES = T, function(x) { print(paste("Region:", x[[1]])) regions <- x[[1]] if (x[[2]] == "cmp") { base_data <- copy(regional_pfs_cmp) %>% filter(size_grp == x[[4]]) } if (x[[2]] == 
"hml") { base_data <- copy(regional_pfs) } # Prepare Data data <- base_data %>% filter(eom >= settings$start_date & eom <= settings$end_date) %>% filter(region %in% regions) %>% eb_prepare( scale_alpha = settings$eb$scale_alpha, overlapping = settings$eb$overlapping ) # Run Empirical Bayes op <- data %>% emp_bayes( cluster_labels = cluster_labels, min_obs = settings$eb$min_obs, fix_alpha = settings$eb$fix_alpha, bs_cov = settings$eb$bs_cov, layers = x[[3]], shrinkage = settings$eb$shrinkage, cor_type = settings$eb$cor_type, bs_samples = settings$eb$bs_samples, seed = settings$seed ) # Output return(op) }) # Simulations EB vs. BY -------------- # Simulations if (update_sim) { # Values from Data pairwise_cor <- eb_est$us$input$long %>% select(characteristic, eom, ret_neu) %>% spread(key = characteristic, value = ret_neu) %>% select(-eom) %>% cor(use = "pairwise.complete.obs") cor_value <- pairwise_cor %>% as_tibble(rownames = "char1") %>% gather(-char1, key = "char2", value = "cor") %>% left_join(cluster_labels %>% select(characteristic, "hcl1" = hcl_label), by = c("char1"="characteristic")) %>% left_join(cluster_labels %>% select(characteristic, "hcl2" = hcl_label), by = c("char2"="characteristic")) %>% filter(char1 != char2) %>% mutate(same_cluster = (hcl1 == hcl2)) %>% group_by(same_cluster) %>% summarise(avg_cor = mean(cor)) # Time periods med_months <- eb_est$us$input$long %>% group_by(characteristic) %>% summarise(n = n()) %>% pull(n) %>% median() data <- list( yrs = round(med_months / 12), cor_within = cor_value %>% filter(same_cluster == T) %>% pull(avg_cor) %>% round(digits = 2), cor_across = cor_value %>% filter(same_cluster == F) %>% pull(avg_cor) %>% round(digits = 2) ) # Simulation Settings set.seed(settings$seed) sim <- list( "alpha_0" = 0, "t" = 12*70, # Median amount of data "clusters" = 13, "fct_pr_cl" = 10, "corr_within" = 0.58, "corr_across" = 0.02, "n_sims" = 10000, "tau_c" = c(0.01, seq(from = 0.05, to = 0.5, by = 0.05)), "tau_w" = c(0.01, 0.2) ) 
sim$se <- (10/sqrt(12))/sqrt(sim$t) sim$n <- sim$clusters * sim$fct_pr_cl # Check settings are consistent with data [Alert if there is a significant difference] if (abs(sim$t - data$yrs*12) > 12 | abs(sim$corr_within - data$cor_within) > 0.05 | abs(sim$corr_across - data$cor_across) > 0.05) { warning("SIMULATION AND DATA VALUES ARE NOT CONSISTENT!") print(data) print(list("yrs"=sim$t/12, "corr_within"=sim$corr_within, "corr_across"=sim$corr_across)) } simulation <- sim_mt_control(sim_settings = sim) simulation %>% saveRDS(file = paste0(object_path, "/fdr_sim.RDS")) } else { simulation <- readRDS(file = paste0(object_path, "/fdr_sim.RDS")) } # False Discovery Rate model_fdr <- fdr_sim(t_low = 0, a_vec = eb_est$us$factor_mean, a_cov = eb_est$us$factor_cov, n_sim = 10000, seed = settings$seed) # Multiple Testing Adjustments mt <- multiple_testing(eb_all = eb_est$all, eb_world = eb_est$world) # Tangency Portfolios ----------------------------- # Regions tpf_world <- eb_est$world$input$long %>% tpf_cluster(mkt_region = "world", orig_sig = T, min_date = settings$tpf$start$world, n_boots = settings$tpf$bs_samples, shorting = settings$tpf$shorting, seed = settings$seed) tpf_us <- eb_est$us$input$long %>% tpf_cluster(mkt_region = "us", orig_sig = T, min_date = settings$tpf$start$us, n_boots = settings$tpf$bs_samples, shorting = settings$tpf$shorting, seed = settings$seed) tpf_dev <- eb_est$developed$input$long %>% tpf_cluster(mkt_region = "developed", orig_sig = T, min_date = settings$tpf$start$developed, n_boots = settings$tpf$bs_samples, shorting = settings$tpf$shorting, seed = settings$seed) tpf_emer <- eb_est$emerging$input$long %>% tpf_cluster(mkt_region = "emerging", orig_sig = T, min_date = settings$tpf$start$emerging, n_boots = settings$tpf$bs_samples, shorting = settings$tpf$shorting, seed = settings$seed) # Size Groups tpf_size <- c("mega", "large", "small", "micro", "nano") %>% lapply(function(x) { eb_est[[paste0("us_", x)]]$input$long %>% tpf_cluster(mkt_region 
= "us", orig_sig = T, min_date = settings$tpf$start$size_grps, n_boots = settings$tpf$bs_samples, shorting = settings$tpf$shorting, seed = settings$seed) %>% mutate(size_grp = x) }) %>% bind_rows() # Single Factor TPF tpf_factors <- prepare_tpf_factors(region = settings$tpf_factors$region, orig_sig_values = settings$tpf_factors$orig_sig, start = settings$tpf_factors$start, scale = settings$tpf_factors$scale) opt_s <- tpf_factors$long %>% optimal_shrinkage(k = settings$tpf_factors$k) # Posterior Over time ----------------- ot_region <- "world" if (update_post_over_time) { for (fix_taus in c(T,F)) { if (fix_taus) { fixed_priors <- list( "alpha" = eb_est[[ot_region]]$mle %>% filter(estimate == "alpha") %>% pull(ml_est), "tau_c" = eb_est[[ot_region]]$mle %>% filter(estimate == "tau_c") %>% pull(ml_est), "tau_s" = eb_est[[ot_region]]$mle %>% filter(estimate == "tau_s") %>% pull(ml_est) ) } else { fixed_priors <- NULL } periods <- sort(unique(regional_pfs$eom)) periods <- periods[month(periods) == 12] # Only estimate once per year time_chars <- regional_pfs %>% filter(region == ot_region & eom <= as.Date("1960-12-31")) %>% group_by(characteristic) %>% filter(n() >= settings$eb$min_obs) %>% pull(characteristic) %>% unique() posterior_over_time <- periods[periods >= as.Date("1960-12-31")] %>% lapply(function(end_date) { print(end_date) # Prepare Data data <- regional_pfs %>% filter(characteristic %in% time_chars) %>% filter(eom >= settings$start_date & eom <= end_date) %>% filter(region == ot_region) %>% eb_prepare( scale_alpha = settings$eb$scale_alpha, overlapping = settings$eb$overlapping ) # Run Empirical Bayes eb_act <- data %>% emp_bayes( cluster_labels = cluster_labels, min_obs = settings$eb$min_obs, fix_alpha = settings$eb$fix_alpha, bs_cov = settings$eb$bs_cov, layers = 2, shrinkage = settings$eb$shrinkage, cor_type = settings$eb$cor_type, bs_samples = 1000, priors = fixed_priors, seed = settings$seed ) eb_act$input <- NULL eb_act$end_date <- end_date 
return(eb_act) }) if (fix_taus) { posterior_over_time %>% saveRDS(file = paste0(object_path, "/posterior_over_time.RDS")) } else { posterior_over_time %>% saveRDS(file = paste0(object_path, "/posterior_over_time_flex.RDS")) } } } posterior_over_time <- readRDS(file = paste0(object_path, "/posterior_over_time.RDS")) posterior_over_time_flex <- readRDS(file = paste0(object_path, "/posterior_over_time_flex.RDS")) # Size Dimension eb_us_size <- c("mega", "large", "small", "micro", "nano") %>% lapply(function(x) { eb_est[[str_c("us_", x)]]$factors %>% mutate(size_grp = x) }) %>% bind_rows() %>% mutate( size_grp = str_to_title(size_grp), size_grp = size_grp %>% factor(levels = c("Mega", "Large", "Small", "Micro", "Nano")) ) # In-Sample / Out-of-Sample ------------------------------------ is_oos <- c("pre", "post", "pre_post") %>% sapply(simplify = F, USE.NAMES = T, function(t) { data <- eb_est$us$input$long %>% prepare_is_oos(min_obs = 60, ret_scaled = "all", orig_group = T, type = t, print=T) regs <- data %>% group_by(characteristic, period, n_is, n_oos) %>% nest() %>% mutate( fit = data %>% map(~lm(ret_adj ~ mkt_vw_exc, data = .x)), tidied = fit %>% map(tidy) ) %>% unnest(tidied) %>% filter(term == "(Intercept)") %>% select(characteristic, period, n_is, n_oos, estimate) %>% spread(key = period, value = estimate) list(data=data, regs=regs) }) # Economi Benefit of More Powerful Multiple Comparison if (update_post_is) { periods <- sort(unique(regional_pfs$eom)) periods <- periods[month(periods) == 12 & year(periods) >= 1959] # Only estimate once per year posterior_is <- periods %>% lapply(function(end_date) { print(paste("Date", end_date, "-" , match(end_date, periods), "out of", length(periods))) # Prepare Data data <- regional_pfs %>% filter(eom >= settings$start_date & eom <= end_date) %>% filter(region == "us") %>% eb_prepare( scale_alpha = settings$eb$scale_alpha, overlapping = settings$eb$overlapping ) # Run Empirical Bayes eb_act <- data %>% emp_bayes( 
cluster_labels = cluster_labels, min_obs = settings$eb$min_obs, fix_alpha = settings$eb$fix_alpha, bs_cov = settings$eb$bs_cov, layers = 2, shrinkage = settings$eb$shrinkage, cor_type = settings$eb$cor_type, bs_samples = 1000, seed = settings$seed ) # Output eb_act$factors %>% mutate(est_date = end_date) }) %>% bind_rows() posterior_is %>% saveRDS(file = paste0(object_path, "/posterior_is.RDS")) } else { posterior_is <- readRDS(file = paste0(object_path, "/posterior_is.RDS")) } sig_oos_pfs <- posterior_is %>% trading_on_significance() # Harvey et al (2016) Simulation - Baseline ------------------------------ # We use the baseline specification from table 5 - Panel A where the average correlation is 0 (the average correlation among factors in our data is 7%) # 1300 * (1-0.396) The harvey et al numbers are m=1297 and m_true=783 harvey_base <- list( alpha_0 = 0, t = 70*12, ret = 4.4 / 12, vol = 10 / sqrt(12), cl = 26, cl_true = 16, fct_pr_cl = 50, corr_across = 0.02, corr_within = 0.58, tau_ws = c(0.21), # We estimate it at 0.21 n_sims = 50, fix_alpha = T ) harvey_base$se <- harvey_base$vol / sqrt(harvey_base$t) harvey_base$n <- harvey_base$cl * harvey_base$fct_pr_cl harvey_base$n_true <- harvey_base$cl_true * harvey_base$fct_pr_cl if (update_harvey_baseline) { harvey_base_res <- harvey_et_al_sim(sim_settings = harvey_base, seed = settings$seed) harvey_base_res <- list("settings" = harvey_base, "sim" = harvey_base_res) harvey_base_res %>% saveRDS(file = paste0(object_path, "/harvey_res_baseline.RDS")) } else { harvey_base_res <- readRDS(file = paste0(object_path, "/harvey_res_baseline.RDS")) } # Harvey et al (2016) Simulation - Worst Case # We use the worst cases specification from table 5 - Panel B where the average correlation is 0 (the average correlation among factors in our data is 7%) # 2500 * (1-0.683) = 800 The harvey numbers are m=2458 and m_true=779 harvey_worst <- list( alpha_0 = 0, t = 70*12, # Median number of years for US factor ret = 4.4 / 12, vol = 10 
/ sqrt(12), cl = 50, cl_true = 16, fct_pr_cl = 50, corr_across = 0.02, corr_within = 0.58, tau_ws = c(0.21), # We estimate it at 0.21 Same as what we estimate n_sims = 50, fix_alpha = T ) harvey_worst$se <- harvey_worst$vol / sqrt(harvey_worst$t) harvey_worst$n <- harvey_worst$cl * harvey_worst$fct_pr_cl harvey_worst$n_true <- harvey_worst$cl_true * harvey_worst$fct_pr_cl if (update_harvey_worstcase) { harvey_worst_res <- harvey_et_al_sim(sim_settings = harvey_worst, seed = settings$seed) harvey_worst_res <- list("settings" = harvey_worst, "sim" = harvey_worst_res) harvey_worst_res %>% saveRDS(file = paste0(object_path, "/harvey_res_worstcase.RDS")) } else { harvey_worst_res <- readRDS(file = paste0(object_path, "/harvey_res_worstcase.RDS")) } # Estimate parameters on OOS data ----------- if (FALSE) { reg <- "us" # Prepare Data data <- regional_pfs %>% filter(eom >= settings$start_date & eom <= settings$end_date) %>% filter(region==reg) %>% left_join(char_info %>% select(characteristic, sample_start, sample_end), by = "characteristic") %>% filter(year(eom) < sample_start | year(eom) > sample_end) %>% eb_prepare( scale_alpha = settings$eb$scale_alpha, overlapping = settings$eb$overlapping # If we start in 1955-02-28 we lose 12 factors relative to starting in 1972-11-30 ) # Run Empirical Bayes op <- data %>% emp_bayes( cluster_labels = cluster_labels, min_obs = settings$eb$min_obs, fix_alpha = settings$eb$fix_alpha, bs_cov = settings$eb$bs_cov, layers = 2, shrinkage = settings$eb$shrinkage, cor_type = settings$eb$cor_type, bs_samples = settings$eb$bs_samples, seed = settings$seed ) # OOS-replication rate op$factors %>% left_join(char_info %>% select(characteristic, significance), by = "characteristic") %>% filter(significance == 1) %>% summarise( n = n(), sd_ols = sd(ols_est), eb_rr = mean(p025 > 0), eb_ols = mean(ols_est - 1.96*ols_se > 0) ) # OOS Hyperparameters op$mle %>% mutate(estimate = if_else(estimate == "tau_s", "tau_w", estimate)) # Full sample Tau's 
eb_est[[reg]]$mle %>% mutate(estimate = if_else(estimate == "tau_s", "tau_w", estimate))

# Replication rate with OOS hyperparameters ------------------------------
# Recompute the posterior replication rate for the factors in `eb_est[[reg]]`
# under externally supplied prior hyper-parameters (alpha0, tau_c, tau_w),
# so full-sample and OOS-estimated priors can be compared on the same data.
#
# Args:
#   chars          Character vector of factor/characteristic names.
#   alphas         OLS alpha estimates, same order/length as `chars`.
#   sigma          Sampling covariance matrix of `alphas`.
#   alpha0         Scalar prior mean, replicated across all factors.
#   tau_c          Prior cluster-level dispersion (shared within a cluster).
#   tau_w          Prior within-cluster dispersion (factor-specific).
#   cluster_labels Tibble mapping `characteristic` to `hcl_label`.
#   char_info      Tibble with `characteristic` and `significance` columns.
#
# Returns: one-row tibble with `repl_rate`, the share of originally
#   significant factors whose posterior 95% CI lies above zero.
repl_rate <- function(chars, alphas, sigma, alpha0, tau_c, tau_w, cluster_labels, char_info) {
  # Prior mean vector: common alpha0 for every factor
  alpha0_vec <- rep(alpha0, length(alphas))
  # Cluster membership matrix m (factors x clusters); mm[i, j] = 1 iff
  # factors i and j share a cluster
  cm <- tibble(characteristic = chars, "alpha" = alphas) %>%
    left_join(cluster_labels, by = "characteristic")
  m <- cm %>%
    mutate(cm = 1) %>%
    select(characteristic, hcl_label, cm) %>%
    spread(key = hcl_label, value = cm) %>%
    select(-characteristic) %>%
    as.matrix()
  m[is.na(m)] <- 0
  mm <- m %*% t(m)
  # Prior covariance: within-cluster variance on the diagonal plus
  # cluster-level variance for factor pairs in the same cluster
  omega <- diag(length(alphas)) * tau_w^2 + mm * tau_c^2
  # Posterior via precision-weighted averaging of prior and data.
  # Invert each matrix once (the original recomputed solve(omega) and
  # solve(sigma) twice each).
  omega_inv <- solve(omega)
  sigma_inv <- solve(sigma)
  post_cov <- solve(omega_inv + sigma_inv)
  post_alpha <- post_cov %*% (omega_inv %*% alpha0_vec + sigma_inv %*% alphas)
  # Replication rate among originally significant factors:
  # posterior mean minus 1.96 posterior SD above zero
  tibble(
    characteristic = chars,
    alpha = drop(post_alpha),
    se = sqrt(diag(post_cov))
  ) %>%
    left_join(char_info, by = "characteristic") %>%
    filter(significance == TRUE) %>%
    summarise(
      repl_rate = mean(alpha - 1.96 * se > 0)
    )
}
# Full Sample Hyper-parameters
repl_rate(
  chars = eb_est[[reg]]$factors$characteristic,
  alphas = eb_est[[reg]]$factors$ols_est,
  sigma = eb_est[[reg]]$sigma,
  alpha0 = eb_est[[reg]]$mle %>% filter(estimate == "alpha") %>% pull(ml_est),
  tau_c = eb_est[[reg]]$mle %>% filter(estimate == "tau_c") %>% pull(ml_est),
  tau_w = eb_est[[reg]]$mle %>% filter(estimate == "tau_s") %>% pull(ml_est),
  cluster_labels = cluster_labels,
  char_info = char_info
)
# OOS Hyper-parameters
repl_rate(
  chars = eb_est[[reg]]$factors$characteristic,
  alphas = eb_est[[reg]]$factors$ols_est,
  sigma = eb_est[[reg]]$sigma,
  alpha0 = op$mle %>% filter(estimate == "alpha") %>% pull(ml_est),
  tau_c = op$mle %>% filter(estimate == "tau_c") %>% pull(ml_est),
  tau_w = op$mle %>% filter(estimate == "tau_s") %>% pull(ml_est),
  cluster_labels = cluster_labels,
  char_info = char_info
)
}
================================================
FILE: Analysis/4 - Output.R
================================================ # Determine Cluster Order cluster_order <- c("Accruals", "Debt Issuance", "Investment", "Short-Term Reversal", "Value", "Low Risk", "Quality", "Momentum", "Profitability", "Profit Growth", "Seasonality", "Size", "Low Leverage") # Collect all output in list output <- list(figures = list(), tables = list()) # Headline Replication Rate headline_rr <- eb_est$us$factors %>% left_join(char_info, by = "characteristic") %>% filter(significance == 1) %>% summarise(rr = mean(p025>0)) %>% pull(rr) # Figures -------------------------------------- # HCL output$figures$hcl_us <- function(tex = F) { par(mar = c(3,2,1,10), cex = 1) c <- 1.2 x <- 37/2 label_func <- function(x) unique(clusters$labels$hcl_label)[unique(clusters$labels$hcl_label) %>% str_detect(x)] clusters_tex <- copy(clusters$dend) if (tex == T) { labels(clusters_tex) <- labels(clusters_tex) %>% str_replace_all("_", "\\\\_") } clusters_tex %>% plot(horiz=T) colored_bars(colors = clusters$bar_colours %>% select(col_dir), dend = clusters_tex, rowLabels = c("Long High"), y_shift = 11/2, horiz = T) # Labels if (settings$hcl$k == 13) { text(x = x, y = 151, label_func("Short-Term Reversal"), cex = c, col = colours_theme[5], adj = 0) #gold-9 text(x = x, y = 141, label_func("Profitability"), cex = c, col = colours_theme[4], adj = 0) #lightgreen-7 text(x = x, y = 129, label_func("Low Risk"), cex = c, col = colours_theme[3], adj = 0) #purple-5 text(x = x, y = 112, label_func("Value"), cex = c, col = colours_theme[2], adj = 0) #orange-4 text(x = x, y = 90, label_func("Investment"), cex = c, col = colours_theme[1], adj = 0) #darkgreen-3 text(x = x, y = 73, label_func("Seasonality"), cex = c, col = colours_theme[11], adj = 0) #orange-4 text(x = x, y = 63, label_func("Debt Issuance"), cex = c, col = colours_theme[9], adj = 0) #red-2 text(x = x, y = 57, label_func("Size"), cex = c, col = colours_theme[7], adj = 0) #blue-1 text(x = x, y = 51, label_func("Accruals"), cex = c, col = 
colours_theme[5], adj = 0) #black-11 text(x = x, y = 43, label_func("Low Leverage"), cex = c, col = colours_theme[4], adj = 0) #lightgreen-7 text(x = x, y = 30, label_func("Profit Growth"), cex = c, col = colours_theme[3], adj = 0) #purple-5 text(x = x, y = 22, label_func("Momentum"), cex = c, col = colours_theme[2], adj = 0) #darkgreen-3 text(x = x, y = 12, label_func("Quality"), cex = c, col = colours_theme[1], adj = 0) #blue-1 } } # Cluster Validation output$figures$hcl_us_val <- clusters$cor %>% cluster_val(labels = clusters$labels, op_format = "pdf") # Literature comparison print(output$figures$lit_comp <- eb_est$us %>% plot_lit_comp(mt_res = mt, eb_world = eb_est$world, excl_insig=T)) # Comparing Multiple Testing with Empirical Bayes c(output$figures$mt_factors, output$figures$mt_summary) %<-% plot_mt_eb_comp( mt = mt, eb_all = eb_est$all, eb_us = eb_est$us, eb_developed = eb_est$developed, eb_world = eb_est$world, eb_emerging = eb_est$emerging, mts = c("OLS", "Bonferroni", "BY"), regs = c("us", "developed", "emerging", "world"), se_methods = c("OLS", "BY", "EB - Region", "EB - All"), se_regions = "us") # Replication Rate by Region output$figures$gl_by_cluster <- plot_repl_region(eb_all = eb_est$all, cluster_order = cluster_order) # Global Factor Posterior output$figures$gl_by_factor <- eb_est$world %>% plot_factor_post(orig_sig = T, cluster_order = cluster_order) # CI Many Factors output$figures$ci_many_fcts <- plot_many_factors() # Tangency Portfolio - US output$figures$tpf <- tpf_us %>% plot_tpf(cluster_order = cluster_order, ci_low = 0.05, ci_high = 0.95) # Tangency Portfolio - Regions output$figures$tpf_regions <- plot_tpf_region(tpf_us = tpf_us, tpf_dev = tpf_dev, tpf_emer = tpf_emer, cluster_order = cluster_order, ci_low = 0.05, ci_high = 0.95) # Tangency Portfolio - Size Groups output$figures$tpf_size <- tpf_size %>% plot_tpf_size(cluster_order = cluster_order, ci_low = 0.05, ci_high = 0.95) # Posterior over Time - Fixed Taus output$figures$overtime 
<- posterior_over_time %>% plot_over_time(orig_sig = T, ols_incl = T, lb = 5, bw=F) output$figures$overtime_bw <- posterior_over_time %>% plot_over_time(orig_sig = T, ols_incl = T, lb = 5, bw=T) # Posterior over Time - Flexible Taus output$figures$overtime_flex <- posterior_over_time_flex %>% plot_over_time(orig_sig = T, ols_incl = F, lb=5, bw=F) output$figures$overtime_flex_bw <- posterior_over_time_flex %>% plot_over_time(orig_sig = T, ols_incl = F, lb=5, bw=T) # Posterior over Time - Flexible Taus - Plot taus output$figures$overtime_flex_taus <- posterior_over_time_flex %>% plot_taus_over_time() # By Size - Overall output$figures$size_overall <- eb_us_size %>% plot_size_overall(flipped = T, text = F) # By Size - Clusters output$figures$size_clusters <- eb_us_size %>% plot_size_clusters(cluster_order = cluster_order) # Model - False Discovery Rate output$figures$model_fdr <- model_fdr %>% plot_fdr() # Simulation - False Discovery Rate output$figures$sim_fdr <- simulation %>% plot_sim_fdr() # US verus world factor output$figures$world_vs_us <- plot_world_vs_us(eb_us = eb_est$us, eb_world_ex_us = eb_est$world_ex_us) # In-sample vs. OOS and Post c(output$figures$is_pre, output$figures$is_post, output$figures$is_oos) %<-% plot_is_oos_post(is_oos = is_oos, type = "GLS") # In-sample vs. 
OOS and Post - quadratic c(output$figures$is_pre_quad, output$figures$is_post_quad, output$figures$is_oos_quad) %<-% plot_is_oos_post_quad(is_oos = is_oos, type = "GLS") # Effect Sizes c(output$figures$effect_world, output$figures$effect_regions, output$figures$effect_size) %<-% plot_effects(type = "ols", orig_sig = T, cluster_order = cluster_order) # Replicateion Rate by Cluster - US output$figures$repl_cluster_us <- eb_est$us$factors %>% plot_repl_cluster(orig_sig = T, cluster_order = cluster_order) # Simulation benchmarked to Harvey et al (2016) if (eb_est$us$mle %>% filter(estimate == "tau_s") %>% pull(ml_est) %>% round(2) != 0.21) { warning("Tau_w in Harvey et al simulation is inconsistent with data!!") } output$figures$sim_harvey <- plot_harvey(harvey_base_res = harvey_base_res, harvey_worst_res = harvey_worst_res, tau_ws = 0.21, act_rr = headline_rr) output$figures$sim_harvey_robustness <- plot_harvey(harvey_base_res = harvey_base_res, harvey_worst_res = harvey_worst_res, tau_ws = c(0.1, 0.21, 0.3), act_rr = headline_rr) # TPF Single Factors output$figures$tpf_factors_one_cluster <- tpf_factors$wide %>% plot_tpf_one_cluster(cluster_labels = cluster_labels, s = opt_s) output$figures$tpf_factors_excl_one <- tpf_factors$wide %>% plot_tpf_excl_cl(cluster_labels = cluster_labels, s = opt_s) output$figures$tpf_factors_imp <- tpf_factors$wide %>% plot_tpf_factor_imp(cluster_labels = cluster_labels, s = opt_s) output$figures$tpf_factors_imp_cluster <- tpf_factors$wide %>% plot_tpf_factor_imp_cluster(cluster_labels = cluster_labels, s = opt_s) tpf_evol <- tpf_factors$long %>% plot_tpf_evolution(data_wide = tpf_factors$wide, char_info = char_info, orig_sig_values = settings$tpf_factors$orig_sig, s = opt_s) output$figures$tpf_evolution <- tpf_evol$plot # Cumulative returns OOS - Marginally significant factors output$figures$marg_sig_oos <- sig_oos_pfs %>% plot_sig_oos(sig_type = "marg_sig", cutoff_2012 = T, first_date = as.Date("1990-01-01"), leg_pos = c(.85, .55)) # 
Also include table for caption # Cummulative returns OOS - EB significant factors output$figures$eb_sig_oos <- sig_oos_pfs %>% plot_sig_oos(sig_type = "eb_sig", cutoff_2012 = F, first_date = as.Date("1990-01-01"), leg_pos = c(.85, .40)) # Also include table for caption # Save Figures as Pictures ------------------------- if (save_figures) { # Base settings output_fig <- function(path, name, format, width, height) { file <- paste0(path, "/", name, ".eps") ggsave(file=file, width = width, height = height, units = "in", dpi = 300) } h <- 5 w <- 9 # For main text ----------------------------------------- # Figure 1 ------ output$figures$lit_comp output_fig(path=output_path, name = "fig1", width = w + 1, height = h) dev.off() output$figures$lit_comp + scale_fill_manual(values = c("black", "grey35")) output_fig(path=output_path, name = "fig1_bw", width = w + 1, height = h) dev.off() # Figure 2 ----- output$figures$marg_sig_oos output_fig(path=output_path, name = "fig2", width = w, height = h*2/3) dev.off() output$figures$marg_sig_oos + scale_colour_manual(values = c("black", "black")) output_fig(path=output_path, name = "fig2_bw", width = w, height = h*2/3) dev.off() # Figure 3 ----- output$figures$sim_fdr output_fig(path=output_path, name = "fig3", format = format, width = w, height = h) dev.off() output$figures$sim_fdr + scale_colour_manual(values = c("black", "black", "black")) output_fig(path=output_path, name = "fig3_bw", format = format, width = w, height = h) dev.off() # Figure 4 ----- output$figures$mt_factors + theme(text = element_text(size = 13), axis.text.x = element_blank()) output_fig(path=output_path, name = "fig4", format = format, width = w, height = h) dev.off() output$figures$mt_factors + theme(text = element_text(size = 13), axis.text.x = element_blank()) + scale_colour_manual(values = rep("black", 3)) output_fig(path=output_path, name = "fig4_bw", format = format, width = w, height = h) dev.off() # Figure 5a ----- output$figures$size_overall + theme( 
axis.title.x = element_text(size = 17), axis.text.x = element_text(size = 17), axis.text.y = element_text(size = 17)) output_fig(path=output_path, name = "fig5a", format = format, width = w, height = h) dev.off() output$figures$size_overall + theme( axis.title.x = element_text(size = 17), axis.text.x = element_text(size = 17), axis.text.y = element_text(size = 17)) + geom_col(fill="grey35") output_fig(path=output_path, name = "fig5a_bw", format = format, width = w, height = h) dev.off() # Figure 5b ----- output$figures$repl_cluster_us + theme( axis.text.y = element_text(size = 13), axis.text.x = element_text(size = 14), text = element_text(size = 14)) output_fig(path=output_path, name = "fig5b", format = format, width = w, height = h) dev.off() output$figures$repl_cluster_us + theme( axis.text.y = element_text(size = 13), axis.text.x = element_text(size = 14), text = element_text(size = 14)) + scale_fill_manual(values = rep("grey35", 13)) output_fig(path=output_path, name = "fig5b_bw", format = format, width = w, height = h) dev.off() # Figure 6 ----- output$figures$mt_summary + theme( text = element_text(size = 13), axis.title.x = element_blank(), axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1, size = 9) ) output_fig(path=output_path, name = "fig6", format = format, width = w, height = h) dev.off() output$figures$mt_summary + theme( text = element_text(size = 13), axis.title.x = element_blank(), axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1, size = 9) ) + scale_fill_grey() output_fig(path=output_path, name = "fig6_bw", format = format, width = w, height = h) dev.off() # Figure 7 ----- output$figures$world_vs_us + theme(text = element_text(size = 13)) output_fig(path=output_path, name = "fig7", format = format, width = h, height = h) dev.off() output$figures$world_vs_us + theme(text = element_text(size = 13)) + scale_colour_manual(values = c("black", "grey35")) output_fig(path=output_path, name = "fig7_bw", format = format, width = h, 
height = h) dev.off() # Figure 8a ----- output$figures$is_pre + theme( text = element_text(size = 12), plot.title = element_text(size = 10, vjust = -2), plot.subtitle = element_text(size = 8, vjust = 0), plot.margin = unit(c(0,0,0,0), "cm") ) output_fig(path=output_path, name = "fig8a", format = format, width = w/3, height = w/3) dev.off() output$figures$is_pre + theme( text = element_text(size = 12), plot.title = element_text(size = 10, vjust = -2), plot.subtitle = element_text(size = 8, vjust = 0), plot.margin = unit(c(0,0,0,0), "cm") ) + geom_point(colour = "black") output_fig(path=output_path, name = "fig8a_bw", format = format, width = w/3, height = w/3) dev.off() # Figure 8b ----- output$figures$is_post + theme( text = element_text(size = 12), plot.title = element_text(size = 10, vjust = -2), plot.subtitle = element_text(size = 8, vjust = 0), plot.margin = unit(c(0,0,0,0), "cm") ) output_fig(path=output_path, name = "fig8b", format = format, width = w/3, height = w/3) dev.off() output$figures$is_post + theme( text = element_text(size = 12), plot.title = element_text(size = 10, vjust = -2), plot.subtitle = element_text(size = 8, vjust = 0), plot.margin = unit(c(0,0,0,0), "cm") ) + geom_point(colour = "black") output_fig(path=output_path, name = "fig8b_bw", format = format, width = w/3, height = w/3) dev.off() # Figure 8c ----- output$figures$is_oos + theme( text = element_text(size = 12), plot.title = element_text(size = 10, vjust = -2), plot.subtitle = element_text(size = 8, vjust = 0), plot.margin = unit(c(0,0,0,0), "cm") ) output_fig(path=output_path, name = "fig8c", format = format, width = w/3, height = w/3) dev.off() output$figures$is_oos + theme( text = element_text(size = 12), plot.title = element_text(size = 10, vjust = -2), plot.subtitle = element_text(size = 8, vjust = 0), plot.margin = unit(c(0,0,0,0), "cm") ) + geom_point(colour = "black") output_fig(path=output_path, name = "fig8c_bw", format = format, width = w/3, height = w/3) dev.off() # 
Figure 9 ----- output$figures$overtime output_fig(path=output_path, name = "fig9", format = format, width = w, height = h) dev.off() output$figures$overtime_bw output_fig(path=output_path, name = "fig9_bw", format = format, width = w, height = h) dev.off() # Figure 10 ----- output$figures$sim_harvey + theme( text = element_text(size = 12), axis.text.x = element_text(size = 11), axis.text.y = element_text(size = 11), legend.text = element_text(size = 11) ) output_fig(path=output_path, name = "fig10", format = format, width = w, height = h) dev.off() output$figures$sim_harvey + theme( text = element_text(size = 12), axis.text.x = element_text(size = 11), axis.text.y = element_text(size = 11), legend.text = element_text(size = 11) ) + scale_colour_manual(values = rep("black", 3)) output_fig(path=output_path, name = "fig10_bw", format = format, width = w, height = h) dev.off() # Figure 11 ----- output$figures$gl_by_factor + theme(text = element_text(size = 13), legend.position = "right", axis.text.x = element_text(size = 5)) output_fig(path=output_path, name = "fig11", format = format, width = w, height = h) dev.off() output$figures$gl_by_factor + theme(text = element_text(size = 13), legend.position = "right", axis.text.x = element_text(size = 5)) + scale_colour_manual(values = rep("black", 13)) output_fig(path=output_path, name = "fig11_bw", format = format, width = w, height = h) dev.off() # Figure 12 ----- output$figures$effect_size + theme( axis.text.y = element_text(size = 13), strip.text.x = element_text(size = 14), axis.text.x = element_text(size = 12), axis.title.x = element_text(size = 13) ) output_fig(path=output_path, name = "fig12a", format = format, width = w, height = h) dev.off() output$figures$effect_size + theme( axis.text.y = element_text(size = 13), strip.text.x = element_text(size = 14), axis.text.x = element_text(size = 12), axis.title.x = element_text(size = 13) ) + scale_fill_manual(values = rep("grey35", 13)) output_fig(path=output_path, name = 
"fig12a_bw", format = format, width = w, height = h) dev.off() # Figure 12b ----- output$figures$effect_regions + theme( axis.text.y = element_text(size = 13), strip.text.x = element_text(size = 13), axis.text.x = element_text(size = 12), axis.title.x = element_text(size = 13) ) output_fig(path=output_path, name = "fig12b", format = format, width = w, height = h) dev.off() output$figures$effect_regions + theme( axis.text.y = element_text(size = 13), strip.text.x = element_text(size = 13), axis.text.x = element_text(size = 12), axis.title.x = element_text(size = 13) ) + scale_fill_manual(values = rep("grey35", 13)) output_fig(path=output_path, name = "fig12b_bw", format = format, width = w, height = h) dev.off() # Figure 13 ----- output$figures$tpf + theme(text = element_text(size = 13), axis.text.x = element_text(size = 13), axis.text.y = element_text(size = 13)) output_fig(path=output_path, name = "fig13", format = format, width = w, height = h) dev.off() output$figures$tpf + theme(text = element_text(size = 13), axis.text.x = element_text(size = 13), axis.text.y = element_text(size = 13)) + scale_fill_manual(values = rep("grey35", 14)) output_fig(path=output_path, name = "fig13_bw", format = format, width = w, height = h) dev.off() # Figure 14 ----- # Evolution of TPF output$figures$tpf_evolution output_fig(path=output_path, name = "fig14", format = format, width = w, height = h*0.8) dev.off() output$figures$tpf_evolution output_fig(path=output_path, name = "fig14_bw", format = format, width = w, height = h*0.8) dev.off() # Figure IA.1 ----- # NEED TO DO!! 
if (FALSE) { # Need to run everything with settings$weighting$us = "vw" and settings$weighting$global_ex_us="vw" output$figures$lit_comp output_fig(path=output_path, name = "figIA1", format = format, width = w + 1, height = h) dev.off() } # Figure IA.2 ------ output$figures$eb_sig_oos output_fig(path=output_path, name = "figIA2", format = format, width = w, height = h*2/3) dev.off() # Figure IA.3a ----- output$figures$is_pre_quad + theme( text = element_text(size = 12), plot.title = element_text(size = 10, vjust = -2), plot.subtitle = element_text(size = 8, vjust = 0), plot.margin = unit(c(0,0,0,0), "cm") ) output_fig(path=output_path, name = "figIA3a", format = format, width = w/3, height = w/3) dev.off() # Figure IA.3b ----- output$figures$is_post_quad + theme( text = element_text(size = 12), plot.title = element_text(size = 10, vjust = -2), plot.subtitle = element_text(size = 8, vjust = 0), plot.margin = unit(c(0,0,0,0), "cm") ) output_fig(path=output_path, name = "figIA3b", format = format, width = w/3, height = w/3) dev.off() # Figure IA.3c ----- output$figures$is_oos_quad + theme( text = element_text(size = 12), plot.title = element_text(size = 10, vjust = -2), plot.subtitle = element_text(size = 8, vjust = 0), plot.margin = unit(c(0,0,0,0), "cm") ) output_fig(path=output_path, name = "figIA3c", format = format, width = w/3, height = w/3) dev.off() # Figure IA.4 ----- output$figures$overtime_flex + theme(text = element_text(size = 13)) output_fig(path=output_path, name = "figIA4", format = format, width = w, height = h) dev.off() # Figure IA.5 ----- output$figures$overtime_flex_taus + theme(text = element_text(size = 13), legend.text = element_text(size = 12)) output_fig(path=output_path, name = "figIA5", format = format, width = w, height = h) dev.off() # Figure IA.6 ---- output$figures$sim_harvey_robustness + theme( text = element_text(size = 12), axis.text.x = element_text(size = 11), axis.text.y = element_text(size = 11), legend.text = element_text(size = 
11) ) output_fig(path=output_path, name = "figIA6", format = format, width = w, height = h) dev.off() # Figure IA.7 ---- output$figures$gl_by_cluster + theme(text = element_text(size = 13)) output_fig(path=output_path, name = "figIA7", format = format, width = w, height = h) dev.off() # Figure IA.8 ---- output$figures$size_clusters output_fig(path=output_path, name = "figIA8", format = format, width = w, height = h+1) dev.off() # Figure IA.9 ---- output$figures$tpf_regions + theme(text = element_text(size = 13), axis.text.x = element_text(size = 13), axis.text.y = element_text(size = 13)) output_fig(path=output_path, name = "figIA9", format = format, width = w, height = h) dev.off() # Figure IA.10 ---- output$figures$tpf_size + theme( text = element_text(size = 13), axis.text.x = element_text(size = 13), axis.text.y = element_text(size = 11)) output_fig(path=output_path, name = "figIA10", format = format, width = w, height = h*1.5) dev.off() # Figure IA.11 ---- output$figures$tpf_factors_one_cluster output_fig(path=output_path, name = "figIA11", format = format, width = w, height = h) dev.off() # Figure IA.12 ---- output$figures$tpf_factors_excl_one output_fig(path=output_path, name = "figIA12", format = format, width = w, height = h) dev.off() # Figure IA.13 ---- output$figures$tpf_factors_imp_cluster + theme( axis.text.y = element_text(size = 6) ) output_fig(path=output_path, name = "figIA13", format = format, width = w, height = h*1.5) dev.off() # Figure IA.14 ---- output$figures$tpf_factors_imp + theme( axis.text.y = element_text(size = 6) ) output_fig(path=output_path, name = "figIA14", format = format, width = w, height = h*1.5) dev.off() # Figure IA.15 (special, because not ggplot) ---- pdf(str_c(output_path, "/figIA15.pdf"), width = w, height = h*2+1.5) output$figures$hcl_us() dev.off() # Figure IA.16 (special, because not ggplot) ---- pdf(str_c(output_path, "/figIA16.pdf"), width = 6, height = 6) output$figures$hcl_us_val() dev.off() # Other figures not 
included in paper -------- output$figures$ci_many_fcts + theme(axis.text.y = element_text(size = 13), text = element_text(size = 13)) output$figures$model_fdr + theme(text = element_text(size = 13)) # TABLES ----------------- # Estimated Taus table_taus() # Economic Benefit of more Power sig_oos_pfs %>% table_economic_benefit() # Factor Performance table_factor_info() # Numbers mentioned in paper -------- # Bayesian Multiple Testing bayes_sim <- 1000000 (fdr_196 <- fdr_fwer_rates(t_cutoff = 1.96, orig_sig = T, a_vec = eb_est$world$factor_mean, a_cov = eb_est$world$factor_cov, n_sim = bayes_sim, seed=settings$seed)) (fdr_278 <- fdr_fwer_rates(t_cutoff = 2.78, orig_sig = T, a_vec = eb_est$world$factor_mean, a_cov = eb_est$world$factor_cov, n_sim = bayes_sim, seed=settings$seed)) (true_factors_tbl <- true_factors(t_cutoff = 0, a_vec = eb_est$world$factor_mean, a_cov = eb_est$world$factor_cov, orig_sig = T, n_sim = bayes_sim, seed=settings$seed)) (rr_unc <- true_factors(t_cutoff = 1.96, a_vec = eb_est$world$factor_mean, a_cov = eb_est$world$factor_cov, orig_sig = T, n_sim = bayes_sim, seed=settings$seed)) # Mentioned in introduction paste0("Replication rate SE: ", round(rr_unc$sd*100, 2), "%") paste0("Bayesian FDR: ", round(fdr_196$fdr_dist$mean*100, 2), "%, with 95% CI of [", round(fdr_196$fdr_dist$p025*100, 2), "%, ", round(fdr_196$fdr_dist$p975*100, 2), "%], SE: ", round(fdr_196$fdr_dist$sd*100, 2)) paste0("Bayesian FWER: ", round(fdr_196$fwer_dist$mean*100, 2), "%, with SE of ", round(fdr_196$fwer_dist$sd*100, 2), "%") paste0("Expected fraction of true factors: ", round(true_factors_tbl$mean*100, 2), "%, with SE of ", round(true_factors_tbl$sd*100, 2), "%") # BY cutoff mt %>% filter(method == "BY" & region == "us") %>% mutate(sig = p<=0.05) %>% group_by(sig) %>% mutate( max = max(abs(statistic)), min = min(abs(statistic)) ) %>% filter(abs(statistic)==max & sig==F | abs(statistic)==min & sig==T) %>% ungroup() %>% summarise( by_cutoff = mean(abs(statistic)) ) %>% 
print() # Change in Book equity factor be_gr_us <- eb_est$us$factors %>% filter(region == "us" & characteristic == "be_gr1a") be_gr_all <- eb_est$all$factors %>% filter(region == "us" & characteristic == "be_gr1a") tibble( characteristic = rep("be_gr1a",2), region = rep("US", 2), data = c("US", "Global"), post_mean = c(be_gr_us$post_mean, be_gr_all$post_mean), post_vol = c(be_gr_us$post_sd, be_gr_all$post_sd), t = post_mean / post_vol ) %>% print() # IS / OOS is_oos$post$regs %>% ungroup() %>% summarise( is = mean(is), post = mean(oos), decline = post/is-1 ) %>% print() c("pre","post","pre_post") %>% lapply(function(x) { is_oos[[x]]$regs %>% mutate(period = x) }) %>% bind_rows() %>% group_by(period) %>% summarise( n = n(), is = mean(is > 0), oos = mean(oos > 0) ) %>% print() # Posterior over time width posterior_over_time %>% plot_over_time(orig_sig = T, ols_incl = T, lb = 5) # Bayesian Multiple Testing fdr_196$fdr_dist fdr_278$fwer_fdr # FWER at t>2.78 true_factors_tbl # Replication rates in different size groups eb_us_size %>% plot_size_overall(flipped = T, text = T) # Publication Bias plot_harvey(harvey_base_res = harvey_base_res, harvey_worst_res = harvey_worst_res, tau_ws = 0.21, act_rr = headline_rr) # Correlations across size and region eb_us_size %>% select(characteristic, size_grp, ols_est) %>% spread(key = size_grp, value = ols_est) %>% summarise( cor_mega_micro = cor(Mega, Micro, method = "spearman"), cor_mega_nano = cor(Mega, Nano, method = "spearman") ) %>% print() eb_est$all$factors %>% select(characteristic, region, ols_est) %>% spread(key = region, value = ols_est) %>% na.omit() %>% summarise( n = n(), cor_us_dev = cor(us, developed, method = "spearman"), cor_us_emer = cor(us, emerging, method = "spearman") ) %>% print() # TPF Evolution numbers tpf_evol$data %>% arrange(year) %>% mutate(tpf_sr_l1 = dplyr::lag(tpf_sr)) %>% filter(year %in% c(min(year), max(year), 2002, 1991)) %>% # char_info[characteristic %in% c("seas_2_5an", "oaccruals_at")] 
arrange(year) %>% mutate( new_inclusions = case_when( year == 1971 ~ "Market", year == 1991 ~ "Accruals", year == 2002 ~ "Seasonality", year == year(settings$end_date) ~ "[All factors included]", ) ) %>% print() # Average pairwise correlations eb_est$us$input$long %>% select(characteristic, eom, ret_neu) %>% spread(key = characteristic, value = ret_neu) %>% select(-eom) %>% cor(use = "pairwise.complete.obs") %>% as_tibble(rownames = "char1") %>% gather(-char1, key = "char2", value = "cor") %>% filter(char1 != char2) %>% summarise(average_cor = mean(cor)) } ================================================ FILE: Analysis/Analysis.Rproj ================================================ Version: 1.0 RestoreWorkspace: Default SaveWorkspace: Default AlwaysSaveHistory: Default EnableCodeIndexing: Yes UseSpacesForTab: Yes NumSpacesForTab: 2 Encoding: UTF-8 RnwWeave: Sweave LaTeX: pdfLaTeX ================================================ FILE: Analysis/README.md ================================================ ## Overview This repository contains code that is used for the paper [Is There a Replication Crisis in Finance?](https://onlinelibrary.wiley.com/doi/full/10.1111/jofi.13249) by Jensen, Kelly and Pedersen (2023). The code used to create the underlying dataset can be found at [https://github.com/bkelly-lab/GlobalFactor](https://github.com/bkelly-lab/GlobalFactor). Please cite Jensen, Kelly and Pedersen (2023) if using the code or data. ## How To Run the Code 1. Start with factor returns in each country. The files you need to download are `hml.csv`, `cmp.csv` and `market_returns.csv` using one of the following methods. 1. Download the factor returns used in the paper [here](https://www.dropbox.com/sh/wcrjok1qyxtrasi/AABZ90GDCUvIzDzijt8Qoo3ha?dl=0). 1. Download the latest version of the factor returns [here](https://www.dropbox.com/sh/xq278bryrj0qf9s/AABUTvTGok91kakyL07LKyQoa?dl=0). 1. 
Generate the factor returns from scratch by following the steps in [GlobalFactors](https://github.com/bkelly-lab/ReplicationCrisis/tree/master/GlobalFactors) as the output from `portfolios.R`. 2. Copy the code from this repository to a local folder. 3. Open `main.R` in the programming language "R". 4. Ensure that the current working directory is the folder from 2. To check this, write `getwd()` in the console. To change the working directory use `setwd()`. 5. Run `main.R`. ## Outputs 1. The console prints key numbers used in the paper as well as the paper tables in LaTeX format. 2. If `save_figures=TRUE` (default), the folder in `output_path` will contain figures of the same format used in the paper. ## Optional Settings 1. `data_path` is the folder with the portfolio data from step 1 (default: current working directory/Data). 2. `object_path` is a folder where R objects can be saved for faster iterations (default: current working directory/Objects). 3. `output_path` is a folder where figures can be saved (default: current working directory/Figures). 4. `save_figures` should be `TRUE` if you wish to save figures, otherwise `FALSE` (default: TRUE). 5. `settings` controls settings for the analysis, including the start and end date, the portfolio weighting scheme, the cluster settings, the empirical Bayes settings, and the tangency portfolio settings (default: the settings in the paper). ## Notes The code is divided into 5 separate R scripts. `0 - Functions.R` contains the project functions, `1 - Prepare Data.R` prepares the data, `2 - Determine Clusters.R` finds statistical clusters based on return data, `3 - Analysis.R` analyzes the data and `4 - Output.R` generates tables and figures based on the analysis. Everything is sourced from `main.R` which also contains user-defined control variables. 
================================================ FILE: Analysis/country_stats.R ================================================ library(xtable) library(tidyverse) library(data.table) # Data [output from SAS code, already screened with obs_main=1, primary_sec=1, exch_main=1] data_path <- "../../Data/Characteristics" country_files <- list.files(data_path) countries <- country_files %>% lapply(function(file) { fread(paste0(data_path, "/", file), select = c("excntry", "id", "eom", "me", "size_grp", "ret_local")) }) %>% rbindlist() countries[, eom := eom %>% as.character() %>% lubridate::fast_strptime(format = "%Y%m%d") %>% as.Date()] # Aggregate by month country_info <- countries[!is.na(me) & !is.na(ret_local), .( n = .N, n_nano = sum(size_grp == "nano"), n_mega = sum(size_grp == "mega"), me = sum(me), me_p50 = median(me) ), by = .(excntry, eom)] # Country Classification country_classification <- readxl::read_xlsx("Country Classification.xlsx", sheet = "countries", range = "A1:C200") %>% select(excntry, msci_development) %>% filter(!is.na(excntry)) %>% setDT() country_info <- country_classification[country_info, on = "excntry"] # Table table_country <- function(country_info, info_date) { tbl_caption <- paste("The table shows summary statistics by the country where a security is listed.\\", "We include common stocks that are the primary security of the underlying firm, traded on a standard exchange, with non-missing return and market equity data.\\", "\\textit{Country} is the ISO code of the underlying exchange country.\\", "For further information, see \\href{https://en.wikipedia.org/wiki/List_of_ISO_3166_country_codes}{https://en.wikipedia.org/wiki/List\\_of\\_ISO\\_3166\\_country\\_codes.}", "\\textit{MSCI} shows the MSCI classification of each country as of January 7th 2021.", "For the most recent classification, see \\href{https://www.msci.com/market-classification}{https://www.msci.com/market-classification}.", "\\textit{Start} is the first date with a valid 
observation.", "In the next 4 columns, the data is shown as of December 31st 2020.\\", "\\textit{Stocks} is the number of stocks available.\\", "\\textit{Mega stocks} is the number of stocks with a market cap above the 80th percentile of NYSE stocks.\\", "\\textit{Total Market Cap} is the aggregate market cap in million USD.\\", "\\textit{Median MC} is the median market cap in million USD.") # Country summary country_info[, start_date := min(eom), by = excntry] country_stats <- country_info[eom==info_date] countries_add <- country_info[!(excntry %in% country_stats$excntry)][, .(excntry, msci_development, eom = info_date, start_date, n=0, n_mega=0, n_nano=0, me=0, me_p50=0)] %>% unique()# Add countries without data by info_date country_stats <- country_stats %>% rbind(countries_add) %>% arrange(-me) %>% select(-c(n_nano, eom)) total <- country_stats %>% ungroup() %>% summarise( excntry = "All", msci_development = "", start_date = NA, n = sum(n), n_mega = sum(n_mega), me = sum(me), me_p50 = NA_real_ ) country_stats %>% bind_rows(total) %>% mutate( n = n %>% prettyNum(big.mark = ",", digits = 0), n_mega = n_mega %>% prettyNum(big.mark=",", digits = 0), start_date = as.character(start_date), me = me %>% formatC(format = "e", digits = 2), me_p50 = me_p50 %>% prettyNum(big.mark=",", digits = 0), msci_development = msci_development %>% str_to_title(), " "= ' ', ) %>% select(excntry, msci_development, start_date, ` `, everything()) %>% rename("Country" = excntry, "MSCI" = msci_development, "Start" = start_date, "Stocks" = n, "Mega Stocks" = n_mega, "Total Market Cap" = me, "Median MC" = me_p50) %>% xtable(digits = 0, align = "lllllrrrr", caption = tbl_caption) %>% print(include.rownames = T, floating = FALSE, latex.environments = "center", hline.after=c(-1, 0), tabular.environment = "longtable", size="\\fontsize{10pt}{12pt}\\selectfont") } # Output for paper-------------------- # Remember: # Only copy from "& country ..." and down. 
# In line 94: # - Delete "94" # - Delete "NA" # - Add hline above and below total # - Make "All" in \textbf{} country_info[!(excntry %in% c("ZWE", "VEN"))] %>% table_country(info_date = as.Date("2020-12-31")) # We exclude Zimbamwe and Venesuela due to data issues # Nano Caps in the US country_info[eom == as.Date("2020-12-31") & excntry == "USA", .(n, n_nano, nano_prop = n_nano / n)] %>% print() ================================================ FILE: Analysis/hxz_decomp.R ================================================ library(lubridate) library(tidyverse) library(data.table) options(dplyr.summarise.inform = FALSE) # User Input ----------------------- # Paths data_path <- "../../Data" # Set to path with global characteristics data. # Start Date start <- as.Date("2020-12-31") # Data ----------------------------- # Characteristics char_info <- readxl::read_xlsx("Factor Details.xlsx", sheet = "details", range = "A1:N300") %>% filter(!is.na(abr_jkp)) %>% select("characteristic"=abr_jkp, direction, significance) %>% mutate(direction = direction %>% as.numeric()) %>% setDT() chars <- char_info$characteristic # NYSE Cutoff nyse_size_cutoffs <- fread(paste0(data_path, "/nyse_cutoffs.csv"), colClasses = c("eom"="character")) nyse_size_cutoffs[, eom := as.Date(eom, format = "%Y%m%d")] # Return Cutoffs ret_cutoffs <- fread(paste0(data_path, "/return_cutoffs.csv"), colClasses = c("eom"="character")) ret_cutoffs[, eom := as.Date(eom, format = "%Y%m%d")] ret_cutoffs[, eom_lag1 := floor_date(eom, unit = "month") - 1] # Because we use ret_exc_lead1m # Data data <- fread(paste0(data_path, "/Characteristics/usa.csv"), select = c("excntry", "id", "eom", "source_crsp", "comp_exchg", "crsp_exchcd", "size_grp", "ret_exc", "ret_exc_lead1m", "me", chars), colClasses = c("eom"="character")) data[, eom := as.Date(lubridate::fast_strptime(eom, format = "%Y%m%d"))] # ME CAP data <- nyse_size_cutoffs[, .(eom, nyse_p80)][data, on = "eom"] data[, me_cap := pmin(me, nyse_p80)][, nyse_p80 := NULL] 
# Screens data <- data[!is.na(size_grp) & !is.na(me) & !is.na(ret_exc_lead1m)] # Winsorize Compustat Returns data <- ret_cutoffs[, .(eom, "p001"=ret_exc_0_1, "p999"=ret_exc_99_9)][data, on = "eom"] data[source_crsp == 0 & ret_exc > p999, ret_exc := p999] data[source_crsp == 0 & ret_exc < p001, ret_exc := p001] data[, c("p001", "p999") := NULL] data <- ret_cutoffs[, .("eom" = eom_lag1, "p001"=ret_exc_0_1, "p999"=ret_exc_99_9)][data, on = "eom"] data[source_crsp == 0 & ret_exc_lead1m > p999, ret_exc_lead1m := p999] data[source_crsp == 0 & ret_exc_lead1m < p001, ret_exc_lead1m := p001] data[, c("source_crsp", "p001", "p999") := NULL] # Create 1 month separated observations returns <- tidyr::crossing("id" = unique(data$id), "eom" = unique(data$eom)) %>% setDT() returns <- data[, .(id, eom, ret_exc)][returns, on = .(id, eom)] returns[, start := min(eom[!is.na(ret_exc)]), by = id] returns <- returns[eom >= start][, start := NULL] returns[, last := floor_date(max(eom[!is.na(ret_exc)]), unit = "month") + months(12) - 1, by = id] # To avoid lookahead bias, _horizon_ months after last obs returns <- returns[eom <= last][, last := NULL] returns %>% setorder(id, eom) pf_func <- function(chars, pfs, bps, bp_min_n, min_stocks, horizon) { # Realized Returns ret_lead <- 1:horizon %>% lapply(function(h) { if (h==1) { r <- data[, .(id, eom, lead = 1, ret_exc = ret_exc_lead1m)][!is.na(ret_exc)] } else { r <- returns[, .(eom, lead = h, ret_exc = dplyr::lead(ret_exc, n = h)), by = id][!is.na(ret_exc)] } r[, eom_ret := ceiling_date(eom, unit = "months")+months(h)-1] }) %>% rbindlist() # Portfolios chars %>% lapply(function(x) { print(paste0(" " , x, ": ", match(x, chars), " out of ", length(chars))) data[, var := as.double(get(x))] sub <- data[!is.na(var), .(id, eom, var, size_grp, me, me_cap, crsp_exchcd, comp_exchg)] # Portfolio Assignment if (bps == "nyse") { sub[, bp_stock := (crsp_exchcd == 1 & is.na(comp_exchg)) | (comp_exchg == 11 & is.na(crsp_exchcd))] } if (bps == "non_mc") { 
sub[, bp_stock := (size_grp %in% c("mega", "large", "small"))] } sub[, bp_n := sum(bp_stock), by = eom] sub <- sub[bp_n >= bp_min_n] # require at least 10 stocks for break points sub[, cdf := ecdf(var[bp_stock == T])(var), by = eom] sub[, min_cdf := min(cdf), by = eom] sub[cdf == min_cdf, cdf := 0.00000001] # To ensure that the lowest value is in portfolio 1 sub[, pf := ceiling(cdf*pfs), by = eom] sub[pf == 0, pf := 1] # Happens when non-bp stocks extend beyond bp stock range # Align with returns sub <- sub[, .(id, eom, me, me_cap, pf)][ret_lead, on = .(id, eom)][!is.na(pf)] # Returns pf_returns <- sub[, .( characteristic = x, n = .N, ret_ew = mean(ret_exc), ret_vw = sum(ret_exc * me) / sum(me), ret_vw_cap = sum(ret_exc * me_cap) / sum(me_cap) ), by = .(pf, eom_ret, lead)] # HML pf_returns[, .( characteristic = x, n_stocks_min = as.integer(min(n[pf==pfs], n[pf==1])), ret_ew = ret_ew[pf == pfs] - ret_ew[pf == 1], ret_vw = ret_vw[pf == pfs] - ret_vw[pf == 1], ret_vw_cap = ret_vw_cap[pf == pfs] - ret_vw_cap[pf == 1] ), by = .(eom_ret, lead)][!is.na(ret_ew) & n_stocks_min >= min_stocks] }) %>% rbindlist() } # Output system.time(hml_nonmc3 <- chars %>% pf_func(pfs = 3, bps = "non_mc", bp_min_n = 5, min_stocks = 5, horizon = 12)) # 47 min system.time(hml_nyse10 <- chars %>% pf_func(pfs = 10, bps = "nyse", bp_min_n = 5, min_stocks = 5, horizon = 12)) rr <- list(hml_nonmc3, hml_nyse10) %>% lapply(function(dt) { repl_data <- char_info[dt, on = "characteristic"] %>% filter(eom_ret <= start) %>% mutate( ret_ew = ret_ew*direction, ret_vw = ret_vw*direction, ret_vw_cap = ret_vw_cap*direction ) %>% pivot_longer(c(ret_ew, ret_vw, ret_vw_cap), names_to = "type", values_to = "ret") repl_data <- c(1, 6, 12) %>% lapply(function(h) { repl_data %>% filter(lead %in% 1:h) %>% group_by(characteristic, eom_ret, type, significance) %>% filter(n() == h) %>% summarise(ret = mean(ret)) %>% ungroup() %>% mutate(horizon = h) }) %>% bind_rows() rr_func <- function(sample) { sample %>% 
group_by(characteristic, type, horizon) %>% summarise( t = mean(ret)/(sd(ret)/sqrt(n())) ) %>% group_by(type) %>% summarise( n = n(), rr = mean(t > 1.96) ) } new_factors <- c( "ret_3_1", "ret_9_1", "ret_12_7", "corr_1260d", "rmax5_21d", "rmax5_rvol_21d", "ni_be", "ocf_at", "ocf_at_chg1", "mispricing_perf", "mispricing_mgmt", "qmj", "qmj_prof", "qmj_growth", "qmj_safety") # Our Benchline Raw Return baseline <- repl_data %>% filter(horizon == 1 & eom_ret <= start) %>% rr_func() %>% mutate(name = "Baseline") # Difference in sample period hor_diff <- repl_data %>% rr_func() %>% mutate(name = "Three Horizons") # Difference in horizons sample_diff <- repl_data %>% filter(eom_ret >= as.Date("1967-01-01") & eom_ret <= as.Date("2016-12-31")) %>% rr_func() %>% mutate(name = "Three Horizons, Shorter Sample") # Differences in Factors fct_diff <- repl_data %>% filter(eom_ret >= as.Date("1967-01-01") & eom_ret <= as.Date("2016-12-31")) %>% filter(!(characteristic %in% new_factors)) %>% rr_func() %>% mutate(name = "Three Horizons, Shorter Sample, Difference in Factors") bind_rows(baseline, sample_diff, hor_diff, fct_diff) %>% arrange(type, name) }) # Decomposition terc_base <- rr[[1]] %>% filter(name == "Baseline" & type == "ret_vw_cap") %>% pull(rr) terc_base_vw <- rr[[1]] %>% filter(name == "Baseline" & type == "ret_vw") %>% pull(rr) terc_hor <- rr[[1]] %>% filter(name == "Three Horizons" & type == "ret_vw") %>% pull(rr) terc_sample <- rr[[1]] %>% filter(name == "Three Horizons, Shorter Sample" & type == "ret_vw") %>% pull(rr) terc_factors <- rr[[1]] %>% filter(name == "Three Horizons, Shorter Sample, Difference in Factors" & type == "ret_vw") %>% pull(rr) dec_factors <- rr[[2]] %>% filter(name == "Three Horizons, Shorter Sample, Difference in Factors" & type == "ret_vw") %>% pull(rr) # From vw_cap to vw terc_base-terc_base_vw # Multiple Horizons terc_base_vw-terc_hor # Shorter sample terc_hor-terc_sample # New Factors terc_sample-terc_factors # Deciles instead of terciles and 
change of BP terc_factors-dec_factors # Explained Difference (expl_rr <- terc_base-((terc_base-terc_base_vw)+(terc_base_vw-terc_hor)+(terc_hor-terc_sample)+(terc_sample-terc_factors)+(terc_factors-dec_factors))) expl_rr-0.35 ================================================ FILE: Analysis/main.R ================================================ library(cowplot) library(directlabels) library(xtable) library(zeallot) library(dendextend) library(RColorBrewer) library(rsample) library(lubridate) library(tidyverse) library(data.table) options(dplyr.summarise.inform = FALSE) # How To -------------------------------------- # Paths # - data_path: Folder that contains market_returns.csv, hml.csv and cmp.csv generated from portfolio.R # - object_path: Folder to save objects too. Retrived when update_*==F # - output_path: Folder to save figures in. Not neccesary if save_figures==F # Save # - save_figures: Should figures be saved in output_path? # Update # - update_sim: Simulations for figure 2 (Simulation Comparison of False Discovery Rates) # - update_post_over_time: Posterior calculations for figure 8 (US Factor Alpha Posterior Distribution over Time) # - update_post_is: Data for regression in table E.1 (The Economic Benefit of More Powerful Tests) # - update_harvey_baseline: Data for figure 9 # - update_harvey_worstcase: Data for figure F.1 # User Input ----------------------- # Paths data_path <- "Data" object_path <- "Objects" output_path <- "Figures" # Save save_figures <- T # Update update_sim <- T update_post_over_time <- T update_post_is <- T update_harvey_baseline <- T update_harvey_worstcase <- T # Settings settings <- list( seed = 1, start_date = as.Date("1925-12-31"), end_date = as.Date("2022-12-31"), # Important that end_date <= Last_CRSP_UPDATE country_excl = c("ZWE", "VEN"), # Countries are excluded due to data issues weighting = list( # Which weighting scheme to use? 
In c("ew", "vw", "vw_cap") us = 'vw_cap', global_ex_us = 'vw_cap' ), n_stocks_min = 5, # Minimum amount of stocks in each side of the portfolios months_min = 5 * 12, # Minimum amount of observations a factor needs to be included country_weighting = "market_cap", # How to weight countries? In ("market_cap", "stocks", "ew") countries_min = 3, # Minimum number of countries necessary in a regional portfolio clusters = "hcl", # Which cluster method to use? In c("manual", "hcl") hcl = list( ret_type = "alpha", # Which return to use in clustering: In c("raw", "alpha") cor_method = "pearson", # Which cor method to base distance upon linkage = "ward.D", # Which linkage method to use k = 13, # How many clusters to colour region = "us", # Region to use for clusters start_year = 1975 # Start year cluster data ), eb = list( scale_alpha = T, overlapping = F, min_obs = 5 * 12, fix_alpha = T, bs_cov = T, shrinkage = 0, cor_type = "block_clusters", bs_samples = 10000 # Set to 10000 for paper ), tpf = list( start = list( world = as.Date("1952-01-01"), us = as.Date("1952-01-01"), developed = as.Date("1987-01-01"), emerging = as.Date("1994-01-01"), size_grps = as.Date("1963-01-01") # Dictated by start of nano-caps ), bs_samples = 10000, # Number of bootstrap samples [10.000 for paper] shorting = F # Should shorting be allowed? ), tpf_factors = list( region = "us", orig_sig = T, # Only include originally significant factor: T, include all: c(T,F) start = as.Date("1972-01-31"), scale = T, # Scale to ex-post volatility of 10%? k = 5 # Number of Folds for cross-validation exercise ) ) # Layout Settings --------------- theme_set(theme_classic()) colours_theme <- c("#0C6291", "#A63446", RColorBrewer::brewer.pal(8, "Dark2"), "darkslategrey", "blue3", "red3", "purple2", "yellow2", "aquamarine", "grey", "salmon", "antiquewhite", "chartreuse") scale_colour_discrete <- function(...) { scale_colour_manual(..., values = colours_theme) } scale_fill_discrete <- function(...) 
{ scale_fill_manual(..., values = colours_theme) } scale_linetype_discrete <- function(...) { scale_linetype_manual(..., values = c("solid", "longdash", "dotted", "dashed", "dotdash", "twodash")) } # Run Scripts ---------- source("0 - Functions.R", echo = T) source("1 - Prepare Data.R", echo = T) source("2 - Determine Clusters.R", echo = T) source("3 - Analysis.R", echo = T) source("4 - Output.R", echo = T) ================================================ FILE: GlobalFactors/CHANGELOG.md ================================================ # CHANGELOG.md This change log keeps track of changes to the underlying data set. In brackets, we highlight versions of importance. The version with _factor data set_ is the basis of the factor portfolios we upload at [https://jkpfactors.com/](https://jkpfactors.com/). The version with _paper data set_ is the basis of [Jensen, Kelly and Pedersen (2023)](https://onlinelibrary.wiley.com/doi/full/10.1111/jofi.13249). ## 05-03-2025 [Factor data set] __Changes__: - Added 2024 data - moved world_data_prelim from scratch to work folder - updated market returns macro to add a capped value option - corrected error in o-score calculation - added end_date filter in macros saving daily and monthly returns ## 11-03-2024 __Changes__: - Added 2023 data - Updated the country classification according the latest MSCI market classification ## 03-03-2023 __Changes__: - Added 2022 data - Added 'me' (market equity) and 'ret' (total return) and removed 'source_crsp' from daily return files __Impact__: - Replication rate: 83.2% ## 30-06-2022 [Paper data set] __Changes__: - Changed name of "Skewness" cluster to "Short-Term Reversal" __Impact__: - Replication rate: 82.4% ## 08-02-2022 __Changes__: - Fix error in the construction of intrinsic_value. Previously, we failed to scale intrinsic_value by market equity as done in Frankel and Lee (1998). We call the new characteristic ival_me and keep intrinsic_value in the data set. 
The alpha of the new factor based on ival_me is significantly different from zero, while the factor based on intrinsic_value is insignificant. __Impact__: - Replication rate: 82.4% (added 2020 data) ## 16-11-2021 __Changes__: - Changed return cutoffs to depend on all stocks, instead of only stocks from CRSP. - Added monthly and daily returns to the output folder. - Changed the 'source' (character) column to 'source_crsp' (integer). source_crsp is 1 if CRSP is the return data source. - Changed the 'id' column from character to integer. For stocks from CRSP, the id is just their permno. For stocks from Compustat, the first digit is 1 if the stock is traded on a US exchange, 2 if it's traded on a Canadian exchange, and 3 otherwise. The next two digits are the IID from Compustat, and the remaining six digits are the gvkey. - Adapted the primary_sec column such that all observations from CRSP have primary_sec=1. - Previously, we treated a zero return as a missing observation. Now, we have removed this screen, such that a zero return is treated like any other return. - Previously, we winsorized daily returns, market equity, and dollar volume, before creating characteristics based on daily stock market data. Now, we have removed this winsorization, and daily characteristics are based on the raw data. - Added the option to create daily factor returns in the portfolios.R code. - Added the option to create industry returns in the portfolios.R code. __Impact__: - Replication rate: 83.2% ## 27-08-2021 __Changes__: - Fixed a bug regarding how daily delisting returns from CRSP are incorporated. - Added indfmt='FS' to the international accounting data. __Impact__: - Replication rate: 83.2% ## 14-06-2021 __Changes__: - We changed the winsorization scheme. First, we removed the 0.01%/99.9% winsorization of market equity in all countries. Second, we removed the winsorization of returns from the CRSP database. 
For Compustat returns, we set returns above (below) the 99.9% (0.01%) percentile of CRSP returns in the same month, to that level. In other words, we base our winsorization of Compustat data on CRSP data from the same month. - We made several changes to the code for easier usability. Notably, the updated `main.sas` file returns a zip folder called "output" in the scratch folder, which contains all data necessary to reproduce the results in the paper. __Impact__: - Replication rate: 83.2% - The revisions impacted all factors slightly, but the overall results are qualitatively very similar. ## 02-19-2021 __Changes__: - Previously we did not exclude securities that are only traded over the counter. In the new version of the data set, we include an indicator column "exch_main" to exclude non-standard exchanges. In the US, the main exchanges are AMEX, NASDAQ and NYSE. Outside of the US, we exclude over the counter exchanges, stock connect exchanges in China and cross-country exchanges such as BATS Chi-X Europe. The documentation includes a full list of the excluded exchanges. - Included SIC, NAICS and GICS industry codes. __Impact__: - Replication rate: 84.0%. - Excluding non-standard exchanges mainly affected the US. By December 2019, the number of stocks in the US dropped from 5,256 to 4,102 (-22%) after adding the new 'exch_main' screen. The excluded securities are mainly tiny stocks traded over the counter, so the aggregate market cap only dropped by 2%. The change also mostly affected post 2000 data, because over the counter observations in Compustat are very rare before this point in time. - The change had a small effect outside of the US, because of our 'primary_sec' screen. It's very rare for Compustat to identify a security traded on a non-standard exchange as the primary security of a firm. - Because the changes mainly affected tiny stocks, our results did not change much. 
Across the 153 factors in the US, Developed and Emerging regions, the change in posterior monthly alpha ranged from -0.06% to +0.07%. ## 02-15-2021 __Changes__: - A bug caused _ivol_ff3_21d_, _iskew_ff3_21d_, _ivol_hxz4_21d_ and _iskew_hxz4_21d_ to require 17 (ff3) and 18 (hxz) observations for a valid estimate. Consistent with our original intent, we now require at least 15 observations for a valid estimate. __Impact__: - Replication Rate: 84.0%. - The changes had a negligible effect on the affected factors. ## 02-01-2021 __Changes__: - Fixed a small bug in the bidask_hl() macro. - When creating asset pricing factors (FF and HXZ), we previously required at least 5 stocks in a sub-portfolio (e.g. small stocks with high BM) for the observation to be valid. This led to missing observations in the 1950s for small stocks with low bm. We lowered this requirement to at least 3 stocks. Furthermore, when creating asset pricing factors, we changed the breakpoints to be based on NYSE stocks in the US instead of non-microcap stocks. Outside of the US, breakpoints are still based on non-microcap stocks. __Impact__: - Replication Rate: 84.0% - The _bidaskhl_21d_ factor changed slightly but is still significantly negative in all regions. The US factor IR changed from -0.11 to -0.09. - The change in asset pricing factors generally didn't affect the results much. ## 01-25-2021 __Changes__: - Changed residual momentum characteristics (resff3_12_1 & resff3_6_1) to be scaled with the standard deviation of residuals consistent with Blitz, Huij and Mertens (2011). - Fixed error in creating _qmj_prof_. The issue was that the _oaccruals_at_ used the value instead of the z-score of ranks. This effectively meant that accruals didn't impact the profitability score. - Fixed error for annual seasonality characteristics (factor names starting with seas_ and ending with _an). 
There was a bug in the screening procedure which meant that the characteristic for one stock could use information from an unrelated stock. - Rounding issues when converting a .csv file to an Excel file caused the zero_trades_* variables to not have any decimals which made the turnover tie-breaker ineffective. - Standardized unexpected earnings (niq_su) and sales (saleq_su) are computed as the actual value minus the expected value (standardized by the standard deviation of this change). Before, the expected value was computed as the mean yearly change over the last 8 quarters added to the last quarterly value. Now the expected value is the same mean yearly change, but added to the quarterly value 4 quarters ago consistent with Jegadeesh and Livnat (2006). __Impact__: - Replication Rate: 84.0% - The change to the residual momentum variables made them slightly weaker. As an example, the monthly OLS information ratio of the US resff3_12_1 factor dropped from 0.33 to 0.28. - The _qmj_prof_ change made _qmj_prof_ and _qmj_ slightly stronger. As an example, the monthly OLS information ratio of the US _qmj_prof_ factor increased from 0.16 to 0.22. - The seasonality fix didn't have a large qualitative impact for the US factors, but did have a large positive effect outside of the US. As an example, the OLS IR of the developed market _seas_11_15an_ factor changed from -0.06 to 0.11. - The zero_trades factors were missing in the developed market because of too few non-missing observations. The developed market zero trades factors are generally strong and the IR ranges from 0.07 to 0.20. Similarly, the Emerging market zero trades factors were slightly negative before. After, the factors are strong with IRs ranging from 0.14 to 0.17. The US market zero trades factors improved slightly. The IR of zero_trades_21d has the most notable increase from 0.05 to 0.09. - The standardized unexpected sales (saleq_su) variable went from a significant IR of 0.12 to an insignificant IR of 0.05. 
This explains the drop in the replication rate. On the other hand, niq_su increased from 0.11 to 0.19. ## 01-15-2021 __Changes__: - Base data set used in the first online version of Jensen, Kelly and Pedersen (2021). __Impact__: - Replication Rate: 84.9% ================================================ FILE: GlobalFactors/Cluster Labels.csv ================================================ characteristic,cluster age,Low Leverage aliq_at,Investment aliq_mat,Low Leverage ami_126d,Size at_be,Low Leverage at_gr1,Investment at_me,Value at_turnover,Quality be_gr1a,Investment be_me,Value beta_60m,Low Risk beta_dimson_21d,Low Risk betabab_1260d,Low Risk betadown_252d,Low Risk bev_mev,Value bidaskhl_21d,Low Leverage capex_abn,Debt Issuance capx_gr1,Investment capx_gr2,Investment capx_gr3,Investment cash_at,Low Leverage chcsho_12m,Value coa_gr1a,Investment col_gr1a,Investment cop_at,Quality cop_atl1,Quality corr_1260d,Seasonality coskew_21d,Seasonality cowc_gr1a,Accruals dbnetis_at,Seasonality debt_gr3,Debt Issuance debt_me,Value dgp_dsale,Quality div12m_me,Value dolvol_126d,Size dolvol_var_126d,Profitability dsale_dinv,Profit Growth dsale_drec,Profit Growth dsale_dsga,Profit Growth earnings_variability,Low Risk ebit_bev,Profitability ebit_sale,Profitability ebitda_mev,Value emp_gr1,Investment eq_dur,Value eqnetis_at,Value eqnpo_12m,Value eqnpo_me,Value eqpo_me,Value f_score,Profitability fcf_me,Value fnl_gr1a,Debt Issuance gp_at,Quality gp_atl1,Quality inv_gr1,Investment inv_gr1a,Investment iskew_capm_21d,Short-Term Reversal iskew_ff3_21d,Short-Term Reversal iskew_hxz4_21d,Short-Term Reversal ival_me,Value ivol_capm_21d,Low Risk ivol_capm_252d,Low Risk ivol_ff3_21d,Low Risk ivol_hxz4_21d,Low Risk kz_index,Seasonality lnoa_gr1a,Investment lti_gr1a,Seasonality market_equity,Size mispricing_mgmt,Investment mispricing_perf,Quality ncoa_gr1a,Investment ncol_gr1a,Debt Issuance netdebt_me,Low Leverage netis_at,Value nfna_gr1a,Debt Issuance ni_ar1,Debt Issuance ni_be,Profitability 
ni_inc8q,Quality ni_ivol,Low Leverage ni_me,Value niq_at,Quality niq_at_chg1,Profit Growth niq_be,Profitability niq_be_chg1,Profit Growth niq_su,Profit Growth nncoa_gr1a,Investment noa_at,Debt Issuance noa_gr1a,Investment o_score,Profitability oaccruals_at,Accruals oaccruals_ni,Accruals ocf_at,Profitability ocf_at_chg1,Profit Growth ocf_me,Value ocfq_saleq_std,Low Risk op_at,Quality op_atl1,Quality ope_be,Profitability ope_bel1,Profitability opex_at,Quality pi_nix,Seasonality ppeinv_gr1a,Investment prc,Size prc_highprc_252d,Momentum qmj,Quality qmj_growth,Quality qmj_prof,Quality qmj_safety,Quality rd_me,Size rd_sale,Low Leverage rd5_at,Low Leverage resff3_12_1,Momentum resff3_6_1,Momentum ret_1_0,Short-Term Reversal ret_12_1,Momentum ret_12_7,Profit Growth ret_3_1,Momentum ret_6_1,Momentum ret_60_12,Investment ret_9_1,Momentum rmax1_21d,Low Risk rmax5_21d,Low Risk rmax5_rvol_21d,Short-Term Reversal rskew_21d,Short-Term Reversal rvol_21d,Low Risk sale_bev,Quality sale_emp_gr1,Profit Growth sale_gr1,Investment sale_gr3,Investment sale_me,Value saleq_gr1,Investment saleq_su,Profit Growth seas_1_1an,Profit Growth seas_1_1na,Momentum seas_11_15an,Seasonality seas_11_15na,Seasonality seas_16_20an,Seasonality seas_16_20na,Accruals seas_2_5an,Seasonality seas_2_5na,Investment seas_6_10an,Seasonality seas_6_10na,Low Risk sti_gr1a,Seasonality taccruals_at,Accruals taccruals_ni,Accruals tangibility,Low Leverage tax_gr1a,Profit Growth turnover_126d,Low Risk turnover_var_126d,Profitability z_score,Low Leverage zero_trades_126d,Low Risk zero_trades_21d,Low Risk zero_trades_252d,Low Risk ================================================ FILE: GlobalFactors/GlobalFactors.Rproj ================================================ Version: 1.0 RestoreWorkspace: Default SaveWorkspace: Default AlwaysSaveHistory: Default EnableCodeIndexing: Yes UseSpacesForTab: Yes NumSpacesForTab: 2 Encoding: UTF-8 RnwWeave: Sweave LaTeX: pdfLaTeX ================================================ FILE: 
GlobalFactors/MD ================================================ ================================================ FILE: GlobalFactors/README.md ================================================ ## Overview This repository contains code that create a dataset of global stock returns and characteristics. The dataset was created for the paper [Is There a Replication Crisis in Finance?](https://onlinelibrary.wiley.com/doi/10.1111/jofi.13249) by Jensen, Kelly and Pedersen (2023). Please cite this paper if you are using the code or data: ``` @article{JensenKellyPedersen2023, author = {Jensen, Theis Ingerslev and Kelly, Bryan and Pedersen, Lasse Heje}, title = {Is There a Replication Crisis in Finance?}, journal = {The Journal of Finance}, volume = {78}, number = {5}, pages = {2465-2518}, year = {2023} } ``` Follow this [link](https://www.dropbox.com/sh/61j1v0sieq9z210/AACdJ68fs5_eT_eJMunwMBWia?dl=0) for a detailed documentation of the data sets. ## How to Generate Global Stock Returns and Stock Characteristics _Note: The data created in this section can be downloaded directly from WRDS ([link](https://wrds-www.wharton.upenn.edu/pages/get-data/contributed-data-forms/global-factor-data/))_ The .sas files construct the stock-level characteristics and factor portfolio returns for all countries. The code requires a connection to WRDS servers. Below we outline our preferred approach to creating the dataset: 1. Connect to the [SAS studio server hosted by WRDS](https://wrds-cloud.wharton.upenn.edu/SASStudio/index?locale=en_US). 2. Create a folder called _Global Data_ in your home directory and upload `main.sas`, `project_macros.sas`, `market_chars.sas`, `accounting_chars.sas`, `char_macros.sas`, and `ind_identification.sas` to this folder. 3. Create an empty folder in your institutions scratch folder. The scratch folder is located at "Sever Files and Folders/Files/scratch/\". 4. Open `main.sas`. 5. Replace line 8 with the path to the scratch folder created in step 3. 6. 
Run `main.sas`. For step 6, we suggest that you make a background submit. To do so, locate the `main.sas` file in your _Global Data_ folder, right-click the file and select "Background Submit". Running the code will take around 24-48 hours. When the code has finished running, your scratch folder will contain a folder called _output_. This folder contains daily and monthly stock returns, daily and monthly market returns, market equity breakpoints from NYSE and return breakpoints to use for winsorization. If `save_csv=1` on line 12, the folder will also hold the data country-by-country in .csv format. Importantly, these files only include the main observation of ordinary common stocks that are the primary securities of the underlying firm, and traded on a main exchange. To include all stocks, modify the 'save_main_data_csv()' macro. Note that the space in the scratch directory is shared across users in your institution. It's best practice to delete files after downloading what you need. Furthermore, files in the scratch directory are deleted after 7 days. ## How to Generate Global Factor Returns _Note: The data created in this section can be downloaded directly from [JKPfactors.com](https://jkpfactors.com/)_ The file `portfolio.R` generates country level factor returns based on the dataset generated by `main.sas`: 1. Run `main.sas`. 2. Save country level .csv files by running the macro `save_main_data_csv()` in `main.sas`. 3. Download the folder named _output_ to a local directory and unzip. 4. Open `portfolio.R`. 5. User defined inputs: 1. Replace the variable `data_path` with the path to the folder where you have unzipped the content of _output_. 2. Replace the variable `output_path` with the path to the folder where you wish to save the factor returns. 3. Replace the variable `legacy_path` with the path to the folder where you wish to save earlier versions of the factor output. If you don't wish to save earlier versions, set the variable to `NULL`. 4. 
The variable `countries` controls which countries to generate factor returns for. By default, it selects all the countries in `data_path`. 5. The variable `chars` controls which characteristics to use for creating factor returns. By default, it is set to the 153 characteristics from Jensen, Kelly and Pedersen (2022). Any column in the characteristic dataset can be used as the basis of a factor. 6. The variable `settings` controls the end date, number of portfolios, which breakpoints to use, the data source, whether to winsorize Compustat returns, the minimum amount of stocks for breakpoints to be valid and whether to create rank weighted (characteristic managed) portfolios in each of the 5 size groups. Default settings are the same as used in Jensen, Kelly and Pedersen (2022). 6. Run `portfolio.R`. ### Output **Files** - `pfs.csv`: We sort stocks into 3 portfolios based on non-microcap breakpoints. Portfolio 1 (3) has the stocks with the lowest (highest) value of the characteristic. - `hml.csv`: Long/short portfolios that are long stocks with high values of the underlying characteristics (portfolio 3 from pfs.csv) and short stocks with low values (portfolio 1 from pfs.csv). - `lms.csv`: Long/short portfolios based on hml.csv but with the signing convention used in Jensen, Kelly and Pedersen (2022). In particular, we sign factors so they are consistent with the literature. For example, we go long low asset growth stocks and short high asset growth stocks, because the literature generally finds that low asset growth stocks outperform. - `cmp.csv`: Rank-weighted (characteristic managed) portfolios within mega, large, small, micro and nano cap stocks in the US. - `market_returns.csv`: Monthly market returns in all the countries we cover. - `Country Factors`: Folder with lms.csv in country-by-country files for easier usability. Countries are saved by their [ISO Alpha-3 codes](https://www.nationsonline.org/oneworld/country_code_list.htm). 
- `Regional Factors`: Folder with regional factors based on lms.csv and the method in Jensen, Kelly and Pedersen (2022). **Variables** - `excntry`: Country where securities are listed as ISO Alpha-3 codes. - `eom`: End-of-month of the month used to calculate returns. - `characteristic`: Name of characteristic, refer to table J.1 in Jensen, Kelly and Pedersen (2022). - `region`: Region/MSCI country development of included factors. - `size_grp`: Size group used to create the rank weighted factors. - `pf`: Portfolio identifier. - `n`: Total number of stocks in the portfolio. - `n_stocks`: Total number of stocks in the long and short portfolio. - `n_stocks_min`: Minimum number of stocks in the long and short portfolio. For example, if the long portfolio has 10 stocks and the short portfolio has 40 stocks, n_stocks=50 and n_stocks_min=10. - `n_countries`: Number of countries included in regional portfolio. - `signal` (pfs.csv): Median characteristic value in the portfolio. - `signal` (hml.csv, lms.csv): Difference between the median characteristics in the long and short portfolio. - `signal_weighted`: Rank weighted signal. - `ret_ew`: Return with equal weights. - `ret_vw`: Return with value weights. - `ret_vw_cap`: Return with capped value weights as used in Jensen, Kelly and Pedersen (2022). - `ret_weighted`: Rank weighted returns. - `me_lag1`: Total market equity within a country at the beginning of the month. - `dolvol_lag1`: Total dollar volume traded within a country in the previous month. - `stocks`: Stocks included in the market portfolio. - `mkt_vw_lcl`: Value weighted market return in local currency. - `mkt_ew_lcl`: Equally weighted market return in local currency. - `mkt_vw`: Value weighted market return in USD. - `mkt_ew`: Equally weighted market return in USD. - `mkt_ew_exc`: Equally weighted market excess return in USD. - `mkt_vw_exc` (market_returns.csv): Value weighted market excess return in USD. 
- `mkt_vw_exc` (Regional Factors): Market cap weighted average of the market excess return in USD for countries included in the regional portfolio. ================================================ FILE: GlobalFactors/accounting_chars.sas ================================================ *************************************************************************** * Characteritics to Extract *************************************************************************** ; /* Pure Accounting Based Characteristics */ %let acc_chars= /* Accounting Based Size Measures */ assets sales book_equity net_income enterprise_value /* 1yr Growth */ at_gr1 ca_gr1 nca_gr1 lt_gr1 cl_gr1 ncl_gr1 be_gr1 pstk_gr1 debt_gr1 sale_gr1 cogs_gr1 sga_gr1 opex_gr1 /* 3yr Growth */ at_gr3 ca_gr3 nca_gr3 lt_gr3 cl_gr3 ncl_gr3 be_gr3 pstk_gr3 debt_gr3 sale_gr3 cogs_gr3 sga_gr3 opex_gr3 /* 1yr Growth Scaled by Assets */ cash_gr1a inv_gr1a rec_gr1a ppeg_gr1a lti_gr1a intan_gr1a debtst_gr1a ap_gr1a txp_gr1a debtlt_gr1a txditc_gr1a coa_gr1a col_gr1a cowc_gr1a ncoa_gr1a ncol_gr1a nncoa_gr1a oa_gr1a ol_gr1a noa_gr1a fna_gr1a fnl_gr1a nfna_gr1a gp_gr1a ebitda_gr1a ebit_gr1a ope_gr1a ni_gr1a nix_gr1a dp_gr1a ocf_gr1a fcf_gr1a nwc_gr1a eqnetis_gr1a dltnetis_gr1a dstnetis_gr1a dbnetis_gr1a netis_gr1a fincf_gr1a eqnpo_gr1a tax_gr1a div_gr1a eqbb_gr1a eqis_gr1a eqpo_gr1a capx_gr1a /* 3yr Growth Scaled by Assets */ cash_gr3a inv_gr3a rec_gr3a ppeg_gr3a lti_gr3a intan_gr3a debtst_gr3a ap_gr3a txp_gr3a debtlt_gr3a txditc_gr3a coa_gr3a col_gr3a cowc_gr3a ncoa_gr3a ncol_gr3a nncoa_gr3a oa_gr3a ol_gr3a fna_gr3a fnl_gr3a nfna_gr3a gp_gr3a ebitda_gr3a ebit_gr3a ope_gr3a ni_gr3a nix_gr3a dp_gr3a ocf_gr3a fcf_gr3a nwc_gr3a eqnetis_gr3a dltnetis_gr3a dstnetis_gr3a dbnetis_gr3a netis_gr3a fincf_gr3a eqnpo_gr3a tax_gr3a div_gr3a eqbb_gr3a eqis_gr3a eqpo_gr3a capx_gr3a /* Investment */ capx_at rd_at /* Profitability */ gp_sale ebitda_sale ebit_sale pi_sale ni_sale nix_sale ocf_sale fcf_sale /* Profit Margins */ gp_at ebitda_at 
ebit_at fi_at cop_at /* Return on Assets */ ope_be ni_be nix_be ocf_be fcf_be /* Return on Book Equity */ gp_bev ebitda_bev ebit_bev fi_bev cop_bev /* Return on Invested Capital */ gp_ppen ebitda_ppen fcf_ppen /* Return on Physical Capital */ /* Issuance */ fincf_at netis_at eqnetis_at eqis_at dbnetis_at dltnetis_at dstnetis_at /* Equity Payout */ eqnpo_at eqbb_at div_at /* Accruals */ oaccruals_at oaccruals_ni taccruals_at taccruals_ni noa_at /* Capitalization/Leverage Ratios */ be_bev debt_bev cash_bev pstk_bev debtlt_bev debtst_bev debt_mev pstk_mev debtlt_mev debtst_mev /* Financial Soundness Ratios */ int_debtlt int_debt cash_lt inv_act rec_act ebitda_debt debtst_debt cl_lt debtlt_debt profit_cl ocf_cl ocf_debt lt_ppen debtlt_be fcf_ocf opex_at nwc_at /* Solvency Ratios */ debt_at debt_be ebit_int /* Liquidity Ratios */ cash_cl caliq_cl ca_cl inv_days rec_days ap_days cash_conversion /* Activity/Efficiency Ratio */ inv_turnover at_turnover rec_turnover ap_turnover /* Non-Recurring Items */ spi_at xido_at nri_at /* Miscalleneous */ adv_sale staff_sale rd_sale div_ni sale_bev sale_be sale_nwc tax_pi /* Balance Sheet Fundamentals to Market Equity */ be_me at_me cash_me /* Income Fundamentals to Market Equity */ gp_me ebitda_me ebit_me ope_me ni_me nix_me sale_me ocf_me fcf_me cop_me rd_me /* Equity Payout/issuance to Market Equity */ div_me eqbb_me eqis_me eqpo_me eqnpo_me eqnetis_me /* Debt Issuance to Market Enterprice Value */ dltnetis_mev dstnetis_mev dbnetis_mev /* Firm Payout/issuance to Market Enterpice Value */ netis_mev /* Balance Sheet Fundamentals to Market Enterprise Value */ at_mev be_mev bev_mev ppen_mev cash_mev /* Income/CF Fundamentals to Market Enterprise Value */ gp_mev ebitda_mev ebit_mev cop_mev sale_mev ocf_mev fcf_mev fincf_mev /* New Variables from HXZ */ ni_inc8q ppeinv_gr1a lnoa_gr1a capx_gr1 capx_gr2 capx_gr3 sti_gr1a niq_at niq_at_chg1 niq_be niq_be_chg1 saleq_gr1 rd5_at dsale_dinv dsale_drec dgp_dsale dsale_dsga saleq_su niq_su 
debt_me netdebt_me capex_abn inv_gr1 be_gr1a op_at pi_nix op_atl1 gp_atl1 ope_bel1 cop_atl1 at_be ocfq_saleq_std aliq_at aliq_mat tangibility eq_dur f_score o_score z_score kz_index intrinsic_value ival_me sale_emp_gr1 emp_gr1 cash_at earnings_variability ni_ar1 ni_ivol /* New Variables not in HXZ */ niq_saleq_std ni_emp sale_emp ni_at ocf_at ocf_at_chg1 roeq_be_std roe_be_std gpoa_ch5 roe_ch5 roa_ch5 cfoa_ch5 gmar_ch5 ; %put ### In total %nwords(&acc_chars.) characteristics will be created ###; ********************************************************************************************************************** * MACRO - Add Helper Vars to Standardized Compustat Accounting ********************************************************************************************************************* Description: The main functionality of this macro is to take the output of %compustat_accounting_data and add helper variables. These helper variables have two main functionalities: 1. They are used to expand coverage of a given variable 2. They are used to create variables not directly available from the accounting statements All added variables have a suffix of '_x'. ; %macro add_helper_vars(data=, out=); /* First ensure that the gap between two dates is always one month */ proc sql; create table __comp_dates1 as select gvkey, curcd, min(datadate) as start_date, max(datadate) as end_date from &data. group by gvkey, curcd; quit; %expand(data=__comp_dates1, out=__comp_dates2, id_vars=gvkey, start_date=start_date, end_date=end_date, freq='month', new_date_name=datadate); proc sql; create table __helpers1 as select a.gvkey, a.curcd, a.datadate, b.*, not missing(b.gvkey) as data_available from __comp_dates2 as a left join &data. 
as b on a.gvkey=b.gvkey and a.curcd=b.curcd and a.datadate=b.datadate; quit; proc sort nodupkey data=__helpers1; by gvkey curcd datadate; run; data __helpers2; set __helpers1; by gvkey curcd; retain count; if first.curcd then count = 1; else count = count+1; run; /* Create Helper Variables */ data &out.; set __helpers2; by gvkey curcd; /* Require Certain Variables to Be Positive */ array var_pos at sale revt dv che; do over var_pos; if var_pos<0 then var_pos = .; end; /* X Variables to Maximize Coverage */ *Income Statement; sale_x = coalesce(sale, revt); /* in NA sale has better coverage in Global revt has better coverage. They are the same though*/ gp_x = coalesce(gp, sale_x-cogs); /*Gross Profit*/ opex_x = coalesce(xopr, cogs+xsga); /* Operating Expenses */ ebitda_x = coalesce(ebitda, oibdp, sale_x-opex_x, gp_x-xsga); /*Operating Income Before Depreciation*/ ebit_x = coalesce(ebit, oiadp, ebitda_x-dp); /*Operating Income Before Depreciation*/ op_x = ebitda_x + coalesce(xrd, 0); /* Operating Profit ala Ball et al (2015)*/ ope_x = ebitda_x-xint; /* Operating Profit to Equity ala FF*/ pi_x = coalesce(pi, ebit_x-xint+coalesce(spi,0)+coalesce(nopi,0)); /* Interest Income is included in NOPI*/ xido_x = coalesce(xido, xi+coalesce(do, 0)); ni_x = coalesce(ib, ni-xido_x, pi_x - txt - coalesce(mii, 0)); nix_x = coalesce(ni, ni_x+coalesce(xido_x, 0), ni_x + xi + do); fi_x = nix_x+xint; /* Firm income i.e. return to equity and debt holders */ div_x = coalesce(dvt, dv); /* See [1] */ * Cash Flow Statement; eqbb_x = sum(prstkc, purtshr); /* Equity Buyback is mainly PRSTKC in NA and PURTSHR in GLOBAL. Using sum() means that any of the two inputs are allowed to be missing */ eqis_x = sstk; /* Equity Issuance is SSTK which is common+preferred Stocks.*/ eqnetis_x = sum(eqis_x,-eqbb_x); /* Net Equity Issuance. 
Using sum means that the variable will be computed as long as one of the inputs are non-missing */ eqpo_x = div_x+eqbb_x; /* Net Equity Payout= Div+Buyback-Issuance*/ eqnpo_x = div_x-eqnetis_x; /* Net Equity Payout= Div+Buyback-Issuance*/ dltnetis_x = coalesce(sum(dltis,-dltr), ltdch, dif12(dltt)); /* Net Long Term Debt issuance. GLOBAL firms only have LTDCH, NA firms only have dltis and DLTR. If cash flow items are missing. Approximate by the change in long term book debt */ if missing(dltis) and missing(dltr) and missing(ltdch) and count<=12 then dltnetis_x = .; dstnetis_x = coalesce(dlcch, dif12(dlc)); /* Prefer dlcch. If this is missing, approximate by the change in short term book debt */ if missing(dlcch) and count<=12 then dstnetis_x = .; dbnetis_x = sum(dstnetis_x, dltnetis_x); netis_x = eqnetis_x+dbnetis_x; /* I require that both equity and debt issuance are available */ fincf_x = coalesce(fincf, netis_x-dv+coalesce(fiao, 0)+coalesce(txbcof, 0)); /* https://wrds-web.wharton.upenn.edu/wrds/support/Data/_001Manuals%20and%20Overviews/_001Compustat/_001North%20America%20-%20Global%20-%20Bank/_000dataguide/index.cfm?_ga=2.179697184.565214677.1585224362-1543109641.1544636729 */ * Balance Sheet Statement; debt_x = sum(dltt, dlc); /* This greatly expands coverage */ pstk_x = coalesce(pstkrv, pstkl, pstk); /* Value of Preferred Stock*/ seq_x = coalesce(seq, ceq+coalesce(pstk_x, 0), at-lt); at_x = coalesce(at, seq_x + dltt + coalesce(lct, 0) + coalesce(lo, 0) + coalesce(txditc, 0)); /* at is only available yearly in the beginning of the fundq data whereas seq and dltt is available on a quarterly basis from the beginning */ ca_x = coalesce(act, rect+invt+che+aco); /* Current Assets */ nca_x = at_x-ca_x; /* Non-Current Assets */ cl_x = coalesce(lct, ap+dlc+txp+lco); /* Current Liabilities */ ncl_x = lt-cl_x; netdebt_x = debt_x - coalesce(che, 0); /* Net Debt for calculating Enterprise Value */ txditc_x = coalesce(txditc, sum(txdb, itcb)); be_x = 
seq_x+coalesce(txditc_x, 0)-coalesce(pstk_x, 0); bev_x = coalesce(icapt+coalesce(dlc, 0)-coalesce(che, 0), netdebt_x+seq_x+coalesce(mib, 0)); coa_x = ca_x - che; /* Operating (non cash) current assets */ col_x = cl_x-coalesce(dlc, 0); /* Operating Current Liabilities */ cowc_x = coa_x-col_x; /* Current Operating Working Capital */ ncoa_x = at_x-ca_x-coalesce(ivao, 0); /* Non-Current Operating Assets */ ncol_x = lt-cl_x-dltt; /* Non-Current Operating Liabilities */ nncoa_x = ncoa_x-ncol_x; /* Net Non-Current Operatng Assets */ fna_x = coalesce(ivst,0)+coalesce(ivao,0); /* Financial Assets */ fnl_x = debt_x+coalesce(pstk_x,0); /* Financial Liabilities */ nfna_x = fna_x-fnl_x; /* Net Financial Assets */ oa_x = coa_x+ncoa_x; /* Operating Assets */ ol_x = col_x+ncol_x; /* Operating Liabilities */ noa_x = oa_x-ol_x; /* Net Operating Assets*/ lnoa_x = ppent+intan+ao-lo+dp; /* Long-term NOA (from HXZ A.3.5)*/ caliq_x = coalesce(ca_x-invt, che+rect); /* Liquid current assets use for quick ratio*/ nwc_x = ca_x-cl_x; ppeinv_x = ppegt + invt; * Should be moved to create_chars as it is a helper variables; aliq_x = che + 0.75 * coa_x + 0.5 * (at_x - ca_x - coalesce(intan, 0)); /* From Ortiz-Molina and Phillips (2014). Don't subtract gdwl since this is already included intangibles */ array var_bs be_x bev_x; do over var_bs; if var_bs<=0 then var_bs = .; end; /* Accruals + OCF/FCF + Cash Based Operating Profit*/ oacc_x = coalesce(ni_x-oancf, dif12(cowc_x)+dif12(nncoa_x)); /* Operating Accruals: Difference between Accounting Earnings and Operating Cash Flow. [2] */ tacc_x = oacc_x+dif12(nfna_x); /* Total Accruals = Accural Earnings - Cash Earnings = Change in Non-Cash Assets - Change in Liabilities. 
*/ if count<=12 then do; oacc_x = .; tacc_x = .; end; ocf_x = coalesce(oancf, ni_x-oacc_x, ni_x + dp - coalesce(wcapt, 0)); fcf_x = ocf_x-capx; /* Note that this does not include funds from financing activities */ cop_x = ebitda_x + coalesce(xrd, 0) - oacc_x; /* Cash Based Operating Profitability (Gerakos et al, 2016) add R&D while subtracting accruals */ drop count; run; proc delete data= __comp_dates1 __comp_dates2 __helpers1 __helpers2; run; %mend add_helper_vars; ********************************************************************************************************************** * MACRO - Compustat Accounting Data Standardized ********************************************************************************************************************* Description: The main functionality of this macro is to create accounting datasets which are comparable across frequency (quarterly/annual) and geography (North America/Global). To make the data comparable across frequency, we modify the quarterly data by: - Quarterize year-to-date variables. - Take the sum over the last 4 quarters for income and cash flow variables - Change name to be consistent with the annual data To make the data comparable across geography, we modify the global data by: - Create columns available in the North American dataset in the global dataset. If possible we infer the values from available columns. Otherwise we just set to missing. - If specified, we change all data to USD. 
; %macro standardized_accounting_data(coverage=, convert_to_usd=, me_data=, include_helpers_vars=1, start_date='01JAN1950'd); /* Coverage in ('na', 'global', 'world'), convert_to_usd in (0,1), include_helpers_vars in (0,1)*/ /* Compustat Accounting Vars to Extract */ %let avars_inc = sale revt gp ebitda oibdp ebit oiadp pi ib ni mii cogs xsga xopr xrd xad xlr dp xi do xido xint spi nopi txt dvt ; %let avars_cf = /* Operating */ oancf ibc dpc xidoc capx ibc dpc wcapt /* Financing */ fincf fiao txbcof ltdch dltis dltr dlcch purtshr prstkc sstk dv dvc ; %let avars_bs = /* Assets */ at act aco che invt rect ivao ivst ppent ppegt intan ao gdwl re /* Liabilities */ lt lct dltt dlc txditc txdb itcb txp ap lco lo seq ceq pstkrv pstkl pstk mib icapt ; * Variables in avars_other are not measured in currency units, and only available in annual data; %let avars_other = emp; %let avars= &avars_inc. &avars_cf. &avars_bs.; /* fdate and pdate. Unfortunately only available from 2007. RDQ is available further back for NA but should probably use 10K dates instead???*/ %put INCOME STATEMENT: %nwords(&avars_inc.) || CASH FLOW STATEMENT: %nwords(&avars_cf.) || BALANCE SHEET: %nwords(&avars_bs.) || OTHER: %nwords(&avars_other.); proc sql noprint; select distinct lowcase(name) into :qvars_q separated by ' ' from dictionary.columns where libname='COMP' and memname in ('FUNDQ', 'G_FUNDQ') and memtype='DATA' and findw(lowcase("&avars."),substr(lowcase(name),1,length(name)-1))>0 and name like "%nrstr(%%)q"; /* Quarterly names ending with q */ quit; proc sql noprint; select distinct lowcase(name) into :qvars_y separated by ' ' from dictionary.columns where libname='COMP' and memname in ('FUNDQ', 'G_FUNDQ') and memtype='DATA' and findw(lowcase("&avars."),substr(lowcase(name),1,length(name)-1))>0 and name like "%nrstr(%%)y"; /* Quarterly names ending with y (ytd_variables)*/ quit; %let qvars = &qvars_q. 
&qvars_y.; * In International but not in NA: dvtq dvty purtshr purtshry ltdch ltdchy LOC?; %if &coverage. = 'global' or &coverage. = 'world' %then %do; %let aname = __gfunda; %let qname = __gfundq; %let compcond=indfmt in ('INDL', 'FS') and datafmt='HIST_STD' and popsrc='I' and consol='C'; data g_funda1; set comp.g_funda; where &compcond. and datadate>=&start_date.; source = 'GLOBAL'; format source $char6.; * Variables Not Available in G_FUNDA with Replacement; ni = ib+coalesce(xi,0)+coalesce(do,0); /*https://wrds-www.wharton.upenn.edu/pages/support/support-articles/compustat/global/ni-net-income-variable/?_ga=2.199276522.59544135.1581901991-1543109641.1544636729 */ * Variables Not Available in G_FUNDA without Replacement; gp = .; pstkrv = .; /*captured by pstk_x*/ pstkl = .; /*captured by pstk_x*/ itcb = .; /*Only used as substitute for txditc (deffered tax and investment credit)*/ xad = .; txbcof = .; /* Part of FINCF but will automatically be set to zero*/ keep gvkey datadate indfmt curcd source &avars. &avars_other.; run; proc sql; create table &aname. as select * from g_funda1 group by gvkey, datadate having count(*)=1 or (count(*)=2 and indfmt='INDL'); /* If the accouting report is available inb both an industrial and financial format, choose financial format (happens very rarely in the international data)*/ alter table &aname. drop indfmt; quit; data g_fundq1; set comp.g_fundq; where &compcond. and datadate>=&start_date.; source = 'GLOBAL'; format source $char6.; * Variables Not Available in G_FUNDQ with Replacement; niq = ibq+coalesce(xiq, 0); /*Discontinued Operations is not available in g_fundq*/ ppegtq = ppentq+dpactq; /* See [3] */ * Variables Not Available in G_FUNDQ without Replacement; icaptq = .; niy = .; txditcq = .; txpq = .; xidoq = .; xidoy = .; xrdq = .; xrdy = .; txbcofy = .; /* Part of FINCF but will automatically be set to zero*/ keep gvkey datadate indfmt fyr fyearq fqtr curcdq source &qvars.; run; proc sql; create table &qname. 
as select * from g_fundq1 group by gvkey, datadate having count(*)=1 or (count(*)=2 and indfmt='INDL'); /* If the accouting report is available inb both an industrial and financial format, choose financial format (happens very rarely in the international data)*/ alter table &qname. drop indfmt; quit; %end; %if &coverage. = 'na' or &coverage. = 'world' %then %do; %let aname = __funda; %let qname = __fundq; %let compcond=indfmt='INDL' and datafmt='STD' and popsrc='D' and consol='C'; data &aname.; set comp.funda; where &compcond. and datadate>=&start_date.; source = 'NA'; format source $char6.; keep gvkey datadate curcd source &avars. &avars_other.; run; data &qname.; set comp.fundq; where &compcond. and datadate>=&start_date.; source = 'NA'; format source $char6.; keep gvkey datadate fyr fyearq fqtr curcdq source &qvars.; run; %end; %if &coverage. = 'world' %then %do; %let aname = __wfunda; %let qname = __wfundq; data &aname.; set __gfunda __funda; run; data &qname.; set __gfundq __fundq; run; /*proc delete data= __gfunda __gfundq __funda __fundq; run;*/ %end; /* If &convert_to_usd=1 then convert everything to USD otherwise keep as local currency*/ %if &convert_to_usd.=1 %then %do; %compustat_fx(out=fx); proc sql; create table __tempa as select a.*, b.fx from &aname. as a left join fx as b on a.datadate=b.date and a.curcd=b.curcdd; create table __tempq as select a.*, b.fx from &qname. as a left join fx as b on a.datadate=b.date and a.curcdq=b.curcdd; quit; data __compa1; set __tempa; array var &avars.; do over var; var = var*fx; end; curcd = 'USD'; drop fx; run; data __compq1; set __tempq; array var &qvars.; do over var; var = var*fx; end; curcdq = 'USD'; drop fx; run; proc delete data=fx __tempa __tempq; %end; %else %do; *Rename Data; proc sql; create table __compa1 as select * from &aname.; create table __compq1 as select * from &qname.; quit; %end; proc delete data= &aname. 
&qname.; /* Change Quarterly Data to Be Comparable to Annual Data */ %QUARTERIZE(inset=__compq1, outset=__compq2, idvar=gvkey fyr, fyear=fyearq, fqtr=fqtr); /* Quarterize the YTD flow accounting variables */ proc sort data=__compq2 nodupkey; by gvkey fyr fyearq fqtr; run; /*THEIS: 0 Deleted in the US*/ %macro ttm(var); (&var + lag1(&var) + lag2(&var) + lag3(&var)) %mend; /* Note that ttm will return missing if either of the lags are missing. This is the behavior we want. */ %macro temp(); /* Prepare quarterly data: if quarterly Compustat variable is missing, replace with quarterized version*/ data __compq3; set __compq2; by gvkey fyr fyearq fqtr; /* Replace missing &var.q with quarterized version &var.y_q*/ %do i=1 %to %nwords(&qvars_y.); %let var_ytd = %scan(&qvars_y., &i, %str(' ')); %let var = %sysfunc(prxchange(s/y$//, 1, &var_ytd.)); if missing(&var.q) then &var.q = &var.y_q; drop &var.y_q; %end; /* Create Quarterly Variables to Keep */ ni_qtr = ibq; sale_qtr = saleq; ocf_qtr = coalesce(oancfq, ibq + dpq - coalesce(wcaptq, 0)); *Cumulate Income/CF Items Over 4 Quarters (This should be made automatic at some point; %let yrl_vars = cogsq xsgaq xintq dpq txtq xrdq dvq spiq saleq revtq cogsq xoprq oibdpq oiadpq ibq niq xidoq nopiq miiq piq xiq xidocq capxq oancfq ibcq dpcq wcaptq prstkcq sstkq purtshrq dsq dltrq ltdchq dlcchq fincfq fiaoq txbcofq dvtq; %do i=1 %to %nwords(&yrl_vars.); %let var_yrl = %scan(&yrl_vars., &i, %str(' ')); %let var_yrl_name = %sysfunc(prxchange(s/q$//, 1, &var_yrl.)); &var_yrl_name. = %ttm(&var_yrl.); if (gvkey^=lag3(gvkey) or fyr^=lag3(fyr) or curcdq^=lag3(curcdq) or %ttm(fqtr)^=10) then &var_yrl_name. = .; if missing(&var_yrl_name.) and fqtr=4 then &var_yrl_name. = &var_yrl_name.y; * If financial quarter is 4, the ytd variable is yearly; drop &var_yrl. 
&var_yrl_name.y; %end; * Rename All Other (Balance Sheet and CURCDQ) Items to Facilitate Merge (This should be made automatic at some point); %let bs_vars = seqq ceqq pstkq icaptq mibq gdwlq req atq actq invtq rectq ppegtq ppentq aoq acoq intanq cheq ivaoq ivstq ltq lctq dlttq dlcq txpq apq lcoq loq txditcq txdbq; %do i=1 %to %nwords(&bs_vars.); %let var_bs = %scan(&bs_vars., &i, %str(' ')); %let var_bs_name = %sysfunc(prxchange(s/q$//, 1, &var_bs.)); rename &var_bs. = &var_bs_name.; %end; rename curcdq = curcd; run; %mend; %temp(); /* Ensure One Obs pr. Datadate */ proc sort data=__compq3 nodupkey; by gvkey datadate fyr; run; data __compq4; set __compq3; by gvkey datadate; if last.datadate; drop fyr fyearq fqtr; run; /*THEIS: US->1432 Observations are deleted in this step*/ /* Add empty quarterly variables to annual data */ data __compa2; set __compa1; ni_qtr = .; sale_qtr = .; ocf_qtr = .; run; /* Add Market Equity at Fiscal End */ proc sql; create table __me_data as select distinct gvkey, eom, me_company as me_fiscal from &me_data. where not missing(gvkey) and primary_sec=1 and not missing(me_company) and common=1 and obs_main=1 /* Notice, exch_main is not a requirement */ group by gvkey, eom having me_company=max(me_company); quit; proc sql; create table __compa3 as select a.*, b.me_fiscal from __compa2 as a left join __me_data as b on a.gvkey = b.gvkey and a.datadate = b.eom; quit; proc sql; create table __compq5 as select a.*, b.me_fiscal from __compq4 as a left join __me_data as b on a.gvkey = b.gvkey and a.datadate = b.eom; quit; /* Include Helper Variables */ %if &include_helpers_vars.=1 %then %do; %let qdata = __compq6; %let adata = __compa4; %add_helper_vars(data = __compq5, out=&qdata.); %add_helper_vars(data = __compa3, out=&adata.); proc delete data=__compq5 __compa3; run; %end; %else %do; %let qdata = __compq5; %let adata = __compa3; %end; /* Output */ proc sort data= &adata. out=acc_std_ann nodupkey; by gvkey datadate; run; proc sort data= &qdata. 
out=acc_std_qtr nodupkey; by gvkey datadate; run; proc delete data=__compq1 __compq2 __compq3 __compq4 __compa1 __compa2 &qdata. &adata.; run; %mend standardized_accounting_data; ********************************************************************************************************************** * MACRO - Create Accounting Characteristics from Compustat Standardized Data ********************************************************************************************************************* Description: The main functionality of this macro is to take the output from %standardized_accounting_data and create characteristics that require accounting data. !!! When we are satisfied with the data, we should really include labels for all accounting characteristics !!! ; %macro create_acc_chars(data=, out=, lag_to_public=, max_data_lag=, __keep_vars=, me_data=, suffix=); /* Helper Macros */ %macro mean_year(var); mean(&var, lag12(&var)) %mend; /* Note that this function will still return a value even if one of the lags are missing. To change this do (var+lag12(var))/2. Could also consider if we should take the average of all observable observations? ie var+lag1(var)+lag2(var)...+lag12(var)*/ %macro apply_to_lastq(x=, _qtrs=, func=); /* The Macro Below is a Generic way of creating lag from current to &n*/ %let mv = &func.(&x.; %do _i=1 %to &_qtrs.-1; %let mv = &mv., lag%eval(&_i.*3)(&x.); %end; %let mv = &mv.); &mv.; %mend apply_to_lastq; %macro apply_to_lasty(x=, yrs=, func=); /* The Macro Below is a Generic way of creating ANNUAL lags from current to &n */ %let mv = &func.(&x.; %do _i=1 %to &yrs.-1; %let mv = &mv., lag%eval(&_i.*12)(&x.); %end; %let mv = &mv.); &mv.; %mend apply_to_lasty; /* Start Procedure */ proc sort data=&data. 
out=__chars3; by gvkey curcd datadate; run; /* deleted _chars1 and __chars2 steps*/ data __chars4; set __chars3; by gvkey curcd; retain count; if first.curcd then count = 1; else count = count+1; run; /* Create Accounting Characteristics */ data __chars5; set __chars4; by gvkey curcd; /* Accounting Based Size Measures */ assets = at_x; sales = sale_x; book_equity = be_x; net_income = ni_x; /* Growth Characteristics */ %let growth_vars = at_x ca_x nca_x /* Assets - Aggregated */ lt cl_x ncl_x /* Liabilities - Aggregated */ be_x pstk_x debt_x /* Financing Book Values */ sale_x cogs xsga opex_x /* Sales and Operating Costs */ capx invt ; * 1yr Growth; %do i=1 %to %nwords(&growth_vars.); %let var_gr1 = %scan(&growth_vars., &i, %str(' ')); %var_growth(var_gr=&var_gr1., horizon=12); %end; * 3yr Growth; %do i=1 %to %nwords(&growth_vars.); %let var_gr3 = %scan(&growth_vars., &i, %str(' ')); %var_growth(var_gr=&var_gr3., horizon = 36); %end; /* Change Scaled by Asset Characteristics */ %let ch_asset_vars = che invt rect ppegt ivao ivst intan /* Assets - Individual Items */ dlc ap txp dltt txditc /* Liabilities - Individual Items*/ coa_x col_x cowc_x ncoa_x ncol_x nncoa_x /* Operating Assets/Liabilities */ oa_x ol_x /* Operating Assets/Liabilities */ fna_x fnl_x nfna_x /* Financial Assets/Liabilities */ gp_x ebitda_x ebit_x ope_x ni_x nix_x dp /* Income Statement*/ fincf_x ocf_x fcf_x nwc_x /* Aggreagted Cash Flow */ eqnetis_x dltnetis_x dstnetis_x dbnetis_x netis_x /* Financing Cash Flow */ eqnpo_x txt /* Tax Change */ eqbb_x eqis_x div_x eqpo_x /* Financing Cash Flow */ capx be_x ; * 1yr Change Scaled by Assets; %do i=1 %to %nwords(&ch_asset_vars.); %let var_gr1a = %scan(&ch_asset_vars., &i, %str(' ')); %chg_to_assets(var_gra = &var_gr1a., horizon = 12); %end; * 3yr Change Scaled by Assets; %do i=1 %to %nwords(&ch_asset_vars.); %let var_gr3a = %scan(&ch_asset_vars., &i, %str(' ')); %chg_to_assets(var_gra = &var_gr3a., horizon = 36); %end; /* Investment Measure */ capx_at = 
capx/at_x; rd_at = xrd/at_x; /* Non-Recurring Items */ spi_at = spi/at_x; xido_at = xido_x/at_x; nri_at = (spi+xido_x)/at_x; /* Non-Recurring Items */ /*Profitability Ratios and Rates of Return*/ * Profit Margins; gp_sale = gp_x/sale_x; /* Gross Profit Margin*/ ebitda_sale = ebitda_x/sale_x; /* Operating Profit Margin before Depreciation */ ebit_sale = ebit_x/sale_x; /* Operating profit Margin after Depreciation */ pi_sale = pi_x/sale_x; /* Pretax Profit Margin */ ni_sale = ni_x/sale_x; /* Net Profit Margin Before XI */ nix_sale = ni/sale_x; /* Net Profit Margin */ ocf_sale = ocf_x/sale_x; /* Cash Flow Margin */ fcf_sale = fcf_x/sale_x; * Return on Assets; gp_at = gp_x/at_x; ebitda_at = ebitda_x/at_x; ebit_at = ebit_x/at_x; fi_at = fi_x/at_x; cop_at = cop_x/at_x; ni_at = ni_x/at_x; * Return on Book Equity; ope_be = ope_x/be_x; ni_be = ni_x/be_x; nix_be = nix_x/be_x; ocf_be = ocf_x/be_x; fcf_be = fcf_x/be_x; * Return on Invested Book Capital; gp_bev = gp_x/bev_x; ebitda_bev = ebitda_x/bev_x; ebit_bev = ebit_x/bev_x; /* Pre tax Return on Book Enterprise Value */ fi_bev = fi_x/bev_x; /* ROIC */ cop_bev = cop_x/bev_x; /* Cash Based Operating Profit to Invested Capital */ * Return on Physical Capital; gp_ppen = gp_x/ppent; ebitda_ppen = ebitda_x/ppent; fcf_ppen = fcf_x/ppent; * Issuance Variables; fincf_at = fincf_x / at_x; netis_at = netis_x / at_x; eqnetis_at = eqnetis_x / at_x; eqis_at = eqis_x / at_x; dbnetis_at = dbnetis_x / at_x; dltnetis_at = dltnetis_x / at_x; dstnetis_at = dstnetis_x / at_x; /* Equity Payout */ eqnpo_at = eqnpo_x / at_x; eqbb_at = eqbb_x / at_x; div_at = div_x / at_x; * Accruals; oaccruals_at = oacc_x/at_x; /* Operating Accruals */ oaccruals_ni = oacc_x/abs(nix_x); /* Percent Operating Accruals */ taccruals_at = tacc_x/at_x; /* Total Accruals */ taccruals_ni = tacc_x/abs(nix_x); /* Percent Total Accruals */ noa_at = noa_x/lag12(at_x); /* Net Operating Asset to Total Assets*/ if count <= 12 or lag12(at_x) <= 0 then do; noa_at = .; end; 
/*Capitalization/Leverage Ratios Book*/ be_bev = be_x/bev_x; /* Common Equity as % of Book Enterprise Value*/ debt_bev = debt_x/bev_x; /* Total Debt as % of Book Enterprise Value*/ cash_bev = che/bev_x; /* Cash and Short-Term Investments to Book Enterprise Value */ pstk_bev = pstk_x/bev_x; /* Prefered Stock to Book Enterprise Value */ debtlt_bev = dltt/bev_x; /* Long-term debt as % of Book Enterprise Value */ debtst_bev = dlc/bev_x; /* Short-term debt as % of Book Enterprise Value */ /*Financial Soundness Ratios*/ int_debt = xint/debt_x; /* Interest as % of average total debt*/ int_debtlt = xint/dltt; /* Interest as % of average long-term debt*/ ebitda_debt = ebitda_x/debt_x; /* Ebitda to total debt*/ profit_cl = ebitda_x/cl_x; /* Profit before D&A to current liabilities*/ ocf_cl = ocf_x/cl_x; /* Operating cash flow to current liabilities*/ ocf_debt = ocf_x/debt_x; /* Operating cash flow to total debt*/ cash_lt = che/lt; /* Cash balance to Total Liabilities*/ inv_act = invt/act; /*inventory as % of current assets*/ rec_act = rect/act; /*receivables as % of current assets*/ debtst_debt = dlc/debt_x; /*short term term as % of total debt*/ cl_lt = cl_x/lt; /*current liabilities as % of total liabilities*/ debtlt_debt = dltt/debt_x; /*long-term debt as % of total liabilities*/ lt_ppen = lt/ppent; /*total liabilities to total tangible assets*/ debtlt_be = dltt/be_x; /*long-term debt to book equity*/ opex_at = opex_x/at_x; /* Operating Leverage ala Novy-Marx (2011) */ nwc_at = nwc_x/at_x; if ocf_x>0 then fcf_ocf = fcf_x/ocf_x; /*Free Cash Flow/Operating Cash Flow*/ /*Solvency Ratios*/ debt_at = debt_x/at_x; /*Debt-to-assets*/ debt_be = debt_x/be_x; /*debt to shareholders' equity ratio*/ ebit_int = ebit_x/xint; /*interest coverage ratio*/ /*Liquidity Ratios*/ inv_days = %mean_year(invt)/cogs * 365; /* Days Inventory Outstanding */ rec_days = %mean_year(rect)/sale_x * 365; /* Days Sales Outstanding */ ap_days = %mean_year(ap)/cogs * 365; /* Days Accounts Payable 
Outstanding */ if count<=12 then do; array var_liq inv_days rec_days ap_days; do over var_liq; var_liq=.; end; end; cash_conversion = inv_days + rec_days - ap_days; /* Cash Conversion Cycle*/ if cash_conversion<0 then cash_conversion =.; if cl_x>0 then do; cash_cl = che/cl_x; /* Cash Ratio*/ caliq_cl = caliq_x/cl_x; /* Quick Ratio (acid test)*/ ca_cl = ca_x/cl_x; /* Current Ratio*/ end; /*Activity/Efficiency Ratios*/ inv_turnover = cogs/%mean_year(invt); /* Inventory Turnover */ at_turnover = sale_x/%mean_year(at_x); /* Asset Turnover */ rec_turnover = sale_x/%mean_year(rect); /* Receivables Turnover */ ap_turnover = (cogs+dif12(invt))/%mean_year(ap); /* Account Payables Turnover */ if count<=12 then do; array var_turn inv_turnover at_turnover rec_turnover ap_turnover; do over var_turn; var_turn=.; end; end; /*Miscallenous Ratios*/ adv_sale = xad/sale_x; /*advertising as % of sales*/ staff_sale = xlr/sale_x; /*labor expense as % of sales*/ sale_bev = sale_x/bev_x; /*sale per $ Book Enterprise Value*/ rd_sale = xrd/sale_x; sale_be = sale_x/be_x; /*sales per $ total stockholders' equity*/ if coalesce(nix_x, ni_x)>0 then div_ni = div_x/nix_x; /*Dividend payout ratio. 
THEIS: I added ib as a possibility*/ if nwc_x>0 then sale_nwc = sale_x/nwc_x; /*sales per $ working capital*/ if pi_x>0 then tax_pi = txt/pi_x; /*effective tax rate*/ /* NEW VARIABLES */ cash_at = che / at_x; if at_x <= 0 then cash_at = .; * Employees based variables; ni_emp = ni_x / emp; if emp <= 0 then ni_emp = .; sale_emp = sale_x / emp; if emp <= 0 then sale_emp = .; sale_emp_gr1 = sale_emp / lag12(sale_emp) - 1; /* Labor force efficiency */ if count <= 12 or lag12(sale_emp) <= 0 then sale_emp_gr1 = .; emp_gr1 = (emp - lag12(emp)) / (0.5 * emp + 0.5 * lag12(emp)); if count <= 12 or emp_gr1 = 0 or (0.5 * emp + 0.5 * lag12(emp)) = 0 then emp_gr1 = .; * Number of Consecutive Earnings Increases; ni_inc = ni_x > lag12(ni_x); if missing(ni_x) or missing(lag12(ni_x)) then ni_inc = .; ni_inc8q = 0; no_decrease = 1; %do q = 0 %to 7; %let ql = %sysevalf(&q.*3); if lag&ql.(ni_inc) = 1 and no_decrease = 1 then ni_inc8q = ni_inc8q + 1; else no_decrease = 0; %end; n_ni_inc = %apply_to_lastq(x = not missing(ni_inc), _qtrs = 8, func = sum); if missing(ni_inc) or n_ni_inc ^= 8 or count < 33 then ni_inc8q = .; drop no_decrease n_ni_inc; * 1yr Change Scaled by Lagged Assets; %let ch_asset_lag_vars = noa_x ppeinv_x ; %do i=1 %to %nwords(&ch_asset_lag_vars.); %let var_gr1al = %scan(&ch_asset_lag_vars., &i, %str(' ')); %let name_gr1al = %sysfunc(tranwrd(&var_gr1al, _x, %str())); /* Remove '_x' from var name */ &name_gr1al._gr1a = (&var_gr1al-lag12(&var_gr1al))/lag12(at_x); if count<=12 or lag12(at_x)<=0 then &name_gr1al._gr1a = .; %end; * 1yr Change Scaled by Average Assets; %let ch_asset_avg_vars = lnoa_x ; %do i=1 %to %nwords(&ch_asset_avg_vars.); %let var_gr1aa = %scan(&ch_asset_avg_vars., &i, %str(' ')); %let name_gr1aa = %sysfunc(tranwrd(&var_gr1aa, _x, %str())); /* Remove '_x' from var name */ &name_gr1aa._gr1a = (&var_gr1aa-lag12(&var_gr1aa))/(at_x + lag12(at_x)); if count<=12 or (at_x + lag12(at_x))<=0 then &name_gr1aa._gr1a = .; %end; * CAPEX growth over 2 years; 
%var_growth(var_gr=capx, horizon=24); * Quarterly Profitability Measures; saleq_gr1 = sale_qtr / lag12(sale_qtr) - 1; if count <= 12 or lag12(sale_qtr) < 0 then saleq_gr1 = .; niq_be = ni_qtr / lag3(be_x); if count <= 3 or lag3(be_x) < 0 then niq_be = .; niq_at = ni_qtr / lag3(at_x); if count <= 3 or lag3(at_x) < 0 then niq_at = .; niq_be_chg1 = niq_be - lag12(niq_be); niq_at_chg1 = niq_at - lag12(niq_at); if count <= 12 then do; niq_be_chg1 = .; niq_at_chg1 = .; end; * R&D capital-to-assets; rd5_at = (xrd + lag12(xrd)*0.8 + lag24(xrd)*0.6 + lag36(xrd)*0.4 + lag48(xrd)*0.2) / at_x; if count <= 48 or at_x <= 0 then rd5_at = .; * Abarbanell and Bushee (1998); %chg_to_exp(var_ce = sale_x); %chg_to_exp(var_ce = invt); %chg_to_exp(var_ce = rect); %chg_to_exp(var_ce = gp_x); %chg_to_exp(var_ce = xsga); dsale_dinv = sale_ce - invt_ce; dsale_drec = sale_ce - rect_ce; dgp_dsale = gp_ce - sale_ce; dsale_dsga = sale_ce - xsga_ce; drop sale_ce invt_ce rect_ce gp_ce xsga_ce; * Earnings and Revenue 'Surpise'; %standardized_unexpected(var=sale_qtr, qtrs = 8, qtrs_min = 6); %standardized_unexpected(var=ni_qtr, qtrs = 8, qtrs_min = 6); * Abnormal Corporate Investment; __capex_sale = capx / sale_x; if sale_x <= 0 then __capx_sale = .; capex_abn = __capex_sale / ((lag12(__capex_sale) + lag24(__capex_sale) + lag36(__capex_sale)) / 3) - 1; if count <= 36 then capex_abn = .; drop __capex_sale; /* Profit scaled by lagged */ op_atl1 = op_x / lag12(at_x); if count <= 12 or lag12(at_x) <= 0 then op_atl1 = .; gp_atl1 = gp_x / lag12(at_x); if count <= 12 or lag12(at_x) <= 0 then gp_atl1 = .; ope_bel1 = ope_x / lag12(be_x); if count <= 12 or lag12(be_x) <= 0 then ope_bel1 = .; cop_atl1 = cop_x / lag12(at_x); if count <= 12 or lag12(at_x) <= 0 then cop_atl1 = .; /* Profitability Measures*/ pi_nix = pi_x / nix_x; if pi_x <= 0 or nix_x <= 0 then pi_nix = .; ocf_at = ocf_x / at_x; op_at = op_x / at_x; if at_x <= 0 then do; ocf_at = .; op_at = .; end; ocf_at_chg1 = ocf_at - lag12(ocf_at); if count 
<= 12 then ocf_at_chg1 = .; /* Book Leverage */ at_be = at_x / be_x; /* Volatility Quarterly Items */ __ocfq_saleq = ocf_qtr / sale_qtr; __niq_saleq = ni_qtr / sale_qtr; if sale_qtr <= 0 then do; __ocfq_saleq = .; __niq_saleq = .; end; __roeq = ni_qtr / be_x; if be_x <= 0 then __roeq = .; %volq(name = ocfq_saleq_std, var = __ocfq_saleq, qtrs = 16, qtrs_min = 8); %volq(name = niq_saleq_std, var = __niq_saleq, qtrs = 16, qtrs_min = 8); %volq(name = roeq_be_std, var = __roeq, qtrs = 20, qtrs_min = 12); drop __ocfq_saleq __niq_saleq __roeq; /* Volatility Annual Items*/ __roe = ni_x / be_x; if be_x <= 0 then __roe = .; %vola(name = roe_be_std, var = __roe, yrs = 5, yrs_min = 5); drop __roe; /* Asset Tangibility */ tangibility = (che + 0.715 * rect + 0.547 * invt + 0.535 * ppegt) / at_x; * Earnings Smoothness; %earnings_variability(esm_h=5); * Asset Liquidity; aliq_at = aliq_x / lag12(at_x); if count <= 12 or lag12(at_x) <= 0 then aliq_at = .; * Equity Duration Helper Variables; %equity_duration_cd(horizon=10, r=0.12, roe_mean=0.12, roe_ar1=0.57, g_mean=0.06, g_ar1=0.24); * Pitroski F-Score; %pitroski_f(name = f_score); * Ohlson (1980) O-score; %ohlson_o(name = o_score); * Altman (1968) Z-score; %altman_z(name = z_score); * Intrinsic ROE based value from Frankel and Lee (1998); %intrinsic_value(name = intrinsic_value, r=0.12); * Kaplan-Zingales Index; %kz_index(name=kz_index); * 5 year ratio change (For quality minus junk variables); %chg_var1_to_var2(name=gpoa_ch5, var1=gp_x, var2=at_x, horizon=60); %chg_var1_to_var2(name=roe_ch5, var1=ni_x, var2=be_x, horizon=60); %chg_var1_to_var2(name=roa_ch5, var1=ni_x, var2=at_x, horizon=60); %chg_var1_to_var2(name=cfoa_ch5, var1=ocf_x, var2=at_x, horizon=60); %chg_var1_to_var2(name=gmar_ch5, var1=gp_x, var2=sale_x, horizon=60); /* Delete Helper Variables */ drop count; run; /* Create earningspersistence */ %earnings_persistence(out=earnings_pers, data=__chars5, __n=5, __min=5); proc sql; create table __chars6 as select a.*, 
b.ni_ar1, b.ni_ivol from __chars5 as a left join earnings_pers as b on a.gvkey = b.gvkey and a.curcd=b.curcd and a.datadate=b.datadate; quit; /* Keep only dates with accounting data */ data __chars7; set __chars6; where data_available=1; run; /* Expand by Public Availability */ * Would be great to change start_date to filling_date or some derivative which was a function of fqtr; proc sort data=__chars7; by gvkey descending datadate; run; data __chars8; set __chars7; by gvkey; start_date = intnx('month', datadate, &lag_to_public.,'e'); format start_date YYMMDDN8.; next_start_date = lag(start_date); if first.gvkey then next_start_date=.; end_date = min(intnx('month', next_start_date, -1, 'e'), intnx('month', datadate, &max_data_lag., 'e')); format end_date YYMMDDN8.; drop next_start_date; run; %expand(data=__chars8, out=__chars9, id_vars=gvkey, start_date=start_date, end_date=end_date, freq='month', new_date_name=public_date); /* Convert All Raw (non-scaled) Variables to USD [2]*/ %compustat_fx(out=__fx); proc sql; create table __chars10 as select a.*, b.fx from __chars9 as a left join __fx as b on a.curcd=b.curcdd and a.public_date=b.date; quit; data __chars11; set __chars10; array var_raw assets sales book_equity net_income; do over var_raw; var_raw = var_raw*fx; end; drop curcd; run; /* Create Ratios using both Accounting and Market Values */ * Note that valuation ratios are created at the company level; proc sql; create table __me_data1 as select distinct gvkey, eom, me_company /* Include id for join with daily std */ from &me_data. 
where not missing(gvkey) and primary_sec=1 and not missing(me_company) and common=1 and obs_main=1 /* Notice, exch_main is not a requirement */ group by gvkey, eom having me_company=max(me_company); quit; proc sql; create table __chars12 as select a.*, b.me_company from __chars11 as a left join __me_data1 as b on a.gvkey=b.gvkey and a.public_date=b.eom; quit; proc sort data=__chars12 out=__chars13 nodupkey; by gvkey public_date; run; /*THEIS: Global-> No duplicates US-> 3464 duplicate observations where deleted*/ data __chars14; set __chars13; /* Prepare Data */ mev = me_company+netdebt_x*fx; /* Enterprise Value (in Dollars) */ mat = at_x * fx - be_x * fx + me_company; /* Market Asset Value */ if mev <= 0 then mev = .; if me_company <= 0 then me_company = .; if mat <= 0 then mat = .; /* Characteristics Scaled by Market Equity */ %let me_vars = at_x be_x debt_x netdebt_x che sale_x gp_x ebitda_x ebit_x ope_x ni_x nix_x cop_x ocf_x fcf_x div_x eqbb_x eqis_x eqpo_x eqnpo_x eqnetis_x xrd; %do i=1 %to %nwords(&me_vars.); %let var_me = %scan(&me_vars., &i, %str(' ')); %let name_me = %sysfunc(tranwrd(&var_me., _x, %str())); /* Remove '_x' from var name */ &name_me._me = (&var_me.*fx)/me_company; %end; ival_me = (intrinsic_value*fx) / me_company; /* Characteristics Scaled by Market Enterprise Value */ %let mev_vars = at_x bev_x ppent be_x che sale_x gp_x ebitda_x ebit_x ope_x ni_x nix_x cop_x ocf_x fcf_x debt_x pstk_x dltt dlc dltnetis_x dstnetis_x dbnetis_x netis_x fincf_x; %do i=1 %to %nwords(&mev_vars.); %let var_mev = %scan(&mev_vars., &i, %str(' ')); %let name_mev = %sysfunc(tranwrd(&var_mev., _x, %str())); /* Remove '_x' from var name */ &name_mev._mev = (&var_mev.*fx)/mev; %end; /* Characteristics Scaled by Market Assets */ aliq_mat = aliq_x * fx / lag12(mat); if gvkey ^= lag12(gvkey) then aliq_mat = .; /* Size Measure */ enterprise_value = mev; /* Equity Duration */ eq_dur = ed_cd_w * fx / me_company + ed_constant * (me_company - ed_cd * fx) / me_company; if ed_err 
= 1 or eq_dur <= 0 then eq_dur = .; run; /* Format Output */ proc sql noprint; select name into :col_names separated by ' ' from dictionary.columns where libname=upcase("work") and memname = upcase("__chars14"); quit; data __chars15; set __chars14; %do i=1 %to %nwords(&col_names.); %let old_name = %scan(&col_names., &i, %str(' ')); %let new_name = %sysfunc(prxchange(s/xrd/rd/, 1, &old_name.)); /* Replace XRD with RD */ %let new_name = %sysfunc(prxchange(s/xsga/sga/, 1, &new_name.)); /* Replace XSGA with SGA */ %let new_name = %sysfunc(prxchange(s/dlc/debtst/, 1, &new_name.)); /* Replace DLC with DEBTST */ %let new_name = %sysfunc(prxchange(s/dltt/debtlt/, 1, &new_name.)); /* Replace DLTT with DEBTLT */ %let new_name = %sysfunc(prxchange(s/oancf/ocf/, 1, &new_name.)); /* Replace OANCF with OCF */ %let new_name = %sysfunc(prxchange(s/ppegt/ppeg/, 1, &new_name.)); /* Replace PPEGT with PPEG */ %let new_name = %sysfunc(prxchange(s/ppent/ppen/, 1, &new_name.)); /* Replace PPENT with PPEN */ %let new_name = %sysfunc(prxchange(s/che/cash/, 1, &new_name.)); /* Replace CHE with CASH */ %let new_name = %sysfunc(prxchange(s/invt/inv/, 1, &new_name.)); /* Replace INVT with INV */ %let new_name = %sysfunc(prxchange(s/rect/rec/, 1, &new_name.)); /* Replace RECT with REC */ %let new_name = %sysfunc(prxchange(s/txt/tax/, 1, &new_name.)); /* Replace TXT with TAX */ %let new_name = %sysfunc(prxchange(s/ivao/lti/, 1, &new_name.)); /* Replace IVAO with LTI */ %let new_name = %sysfunc(prxchange(s/ivst/sti/, 1, &new_name.)); /* Replace IVST with STI */ %let new_name = %sysfunc(prxchange(s/sale_qtr/saleq/, 1, &new_name.)); /* Replace SALE_QTR with SALEQ */ %let new_name = %sysfunc(prxchange(s/ni_qtr/niq/, 1, &new_name.)); /* Replace SALE_QTR with SALEQ */ %let new_name = %sysfunc(prxchange(s/ocf_qtr/ocfq/, 1, &new_name.)); /* Replace SALE_QTR with SALEQ */ rename &old_name. 
= &new_name.; %end; run; * Reorder and Keep only Selected Columns; data __chars16; retain source gvkey datadate public_date assets sales book_equity net_income enterprise_value; set __chars15; keep source gvkey public_date datadate &__keep_vars.; run; * Add suffix if specified; %if %length(&suffix.)>0 %then %do; data __chars16; set __chars16; %do i=1 %to %nwords(&__keep_vars.); %let var_x = %scan(&__keep_vars., &i, %str(' ')); rename &var_x. = &var_x.&suffix.; %end; rename datadate=datadate&suffix.; run; %end; proc sort nodupkey data=__chars16 out=&out.; by gvkey public_date; run; proc delete data= __chars3 __chars4 __chars5 __chars6 __chars7 __chars8 __chars9 __chars10 __chars11 __chars12 __chars13 __chars14 __chars15 __chars16 __me_data __me_data1 __fx earnings_pers; run; %mend create_acc_chars; /* Combine Characteristics from Annual and Quarterly Data */ %macro combine_ann_qtr_chars(out=, ann_data=, qtr_data=, __char_vars=, q_suffix=); proc sql; create table __acc_chars1 as select a.*, b.* from &ann_data. as a left join &qtr_data. as b on a.gvkey=b.gvkey and a.public_date=b.public_date; quit; /* Substitute Annual Characteristic for Quarterly if Quarterly is more recent */ data __acc_chars2; set __acc_chars1; %do i=1 %to %nwords(&__char_vars.); %let ann_var = %scan(&__char_vars., &i.); %let qtr_var = &ann_var.&q_suffix.; if missing(&ann_var.) or (not missing(&qtr_var.) and datadate&q_suffix. > datadate) then /* Didn't include the first part before! */ &ann_var. 
= &qtr_var.; drop &qtr_var.; %end; drop datadate datadate&q_suffix.; /* We can no longer be sure which items accounting dates refer to */ run; proc sort nodupkey data=__acc_chars2 out=&out; by gvkey public_date; run; proc delete data=__acc_chars1 __acc_chars2; run; %mend combine_ann_qtr_chars; ================================================ FILE: GlobalFactors/char_macros.sas ================================================ /* MACROS USING COMPOSITE DATA */ /* MACRO: MISPRICING_FACTORS - Based on the paper by Yuan and Stambaugh (2016) - Currently, the distress probability anomaly is not implemented. - I use fractional ranks i.e. ranks from 0 to 1. I think they use absolute ranks in the paper. */ %macro mispricing_factors(out=, data=, min_stks=, min_fcts=3); proc sql; create table chars1 as select id, eom, excntry, chcsho_12m, eqnpo_12m, oaccruals_at, noa_at, at_gr1, ppeinv_gr1a, o_score, ret_12_1, gp_at, niq_at from &data. where common=1 and primary_sec=1 and obs_main=1 and exch_main = 1 and not missing(ret_exc) and not missing(me) order by excntry, eom; quit; %let __vars = chcsho_12m eqnpo_12m oaccruals_at noa_at at_gr1 ppeinv_gr1a o_score ret_12_1 gp_at niq_at; %let __direction = -1 1 -1 -1 -1 -1 -1 1 1 1; %do i=1 %to 10; %let __v = %scan(&__vars., &i, %str(' ')); %let __d = %scan(&__direction., &i, %str(' ')); %if &__d. = 1 %then %do; %let __sort=; %end; %else %do; %let __sort=descending; %end; proc sql; create table __subset as select * from chars1 group by excntry, eom having count(&__v.) >= &min_stks.; quit; proc rank data=__subset out = __ranks(keep=excntry id eom rank_&__v.) &__sort. ties=mean f; by excntry eom; var &__v.; ranks rank_&__v.; run; proc sql; create table chars%eval(&i.+1) as select a.*, b.rank_&__v. from chars&i. 
as a left join __ranks as b on a.id=b.id and a.eom=b.eom; quit; %end; data &out.; set chars11; mispricing_perf = mean(rank_o_score, rank_ret_12_1, rank_gp_at, rank_niq_at); if missing(rank_o_score) + missing(rank_ret_12_1) + missing(rank_gp_at) + missing(rank_niq_at) > &min_fcts. then mispricing_perf = .; mispricing_mgmt = mean(rank_chcsho_12m, rank_eqnpo_12m, rank_oaccruals_at, rank_noa_at, rank_at_gr1, rank_ppeinv_gr1a); if missing(rank_chcsho_12m) + missing(rank_eqnpo_12m) + missing(rank_oaccruals_at) + missing(rank_noa_at) + missing(rank_at_gr1) + missing(rank_ppeinv_gr1a) > &min_fcts. then mispricing_mgmt = .; keep id eom mispricing_perf mispricing_mgmt; run; %mend; * MACRO: QUALITY MINUS JUNK - Based on the paper by Asness, Frazzini and Pedersen (2018) - I deviate slightly from the original paper in the variable construction - The most clear deviation is the way the growth variables are created. ; %macro quality_minus_junk(out=, data=, min_stks=); /* Helper Macro */ %macro z_ranks(out=, data=, var=, min=, sort=); proc sql; create table __subset as select * from &data. group by excntry, eom having count(&var.) >= &min.; quit; proc rank data=__subset out = __ranks(keep=excntry id eom rank_&var.) &sort. ties=mean; by excntry eom; var &var.; ranks rank_&var.; run; proc sql; create table &out. as select excntry, id, eom, (rank_&var. - mean(rank_&var.)) / std(rank_&var.) as z_&var. from __ranks where not missing(rank_&var.) group by excntry, eom; quit; proc delete data=__subset __ranks; run; %mend; proc sql; create table qmj1 as select id, eom, excntry, coalesce(roeq_be_std*2, roe_be_std) as __evol, /* I multiply the quarterly measure by sqrt(4)=2 to reflect that quarterly measures are less volatile than the annual measure. Empirically, this seems to be a reasonable approximation although perhaps a slightly higher multiplied could be used e.g. 
2.5*/ gp_at, ni_be, ni_at, ocf_at, gp_sale, oaccruals_at, gpoa_ch5, roe_ch5, roa_ch5, cfoa_ch5, gmar_ch5, betabab_1260d, debt_at, o_score, z_score from &data. where common=1 and primary_sec=1 and obs_main=1 and exch_main=1 and not missing(ret_exc) and not missing(me) order by excntry, eom; quit; %let z_vars = gp_at ni_be ni_at ocf_at gp_sale oaccruals_at gpoa_ch5 roe_ch5 roa_ch5 cfoa_ch5 gmar_ch5 betabab_1260d debt_at o_score z_score __evol; %let direction = 1 1 1 1 1 -1 1 1 1 1 1 -1 -1 -1 1 -1; %do i=1 %to 16; %let __v = %scan(&z_vars., &i, %str(' ')); %let __d = %scan(&direction., &i, %str(' ')); %if &__d. = 1 %then %do; %let __sort=; %end; %else %do; %let __sort=descending; %end; %z_ranks(out=__z, data=qmj1, var = &__v., min=&min_stks., sort=&__sort.); proc sql; create table qmj%eval(&i.+1) as select a.*, b.z_&__v. from qmj&i. as a left join __z as b on a.id=b.id and a.eom=b.eom; quit; %if &i.>1 %then %do; proc delete data=qmj&i.; run; %end; %end; data qmj18; set qmj17; __prof = mean(z_gp_at, z_ni_be, z_ni_at, z_ocf_at, z_gp_sale, z_oaccruals_at); __growth = mean(z_gpoa_ch5, z_roe_ch5, z_roa_ch5, z_cfoa_ch5, z_gmar_ch5); __safety = mean(z_betabab_1260d, z_debt_at, z_o_score, z_z_score, z___evol); keep excntry id eom __prof __growth __safety; run; %z_ranks(out=__prof, data=qmj18, var = __prof, min=&min_stks., sort=); %z_ranks(out=__growth, data=qmj18, var = __growth, min=&min_stks., sort=); %z_ranks(out=__safety, data=qmj18, var = __safety, min=&min_stks., sort=); proc sql; create table qmj19 as select a.excntry, a.id, a.eom, b.z___prof as qmj_prof, c.z___growth as qmj_growth, d.z___safety as qmj_safety from qmj18 as a left join __prof as b on a.excntry=b.excntry and a.id=b.id and a.eom=b.eom left join __growth as c on a.excntry=c.excntry and a.id=c.id and a.eom=c.eom left join __safety as d on a.excntry=d.excntry and a.id=d.id and a.eom=d.eom; quit; /* QMJ SCORE! 
*/ data qmj20; set qmj19; __qmj = (qmj_prof + qmj_growth + qmj_safety) / 3; * Missing if any of subcomponents are missing; run; %z_ranks(out=__qmj, data=qmj20, var=__qmj, min = &min_stks., sort=); proc sql; create table &out. as select a.excntry, a.id, a.eom, a.qmj_prof, a.qmj_growth, a.qmj_safety, b.z___qmj as qmj from qmj20 as a left join __qmj as b on a.excntry=b.excntry and a.id=b.id and a.eom=b.eom; quit; proc delete data=qmj1 qmj17 qmj18 qmj19 qmj20; run; %mend; /* MACRO USING RETURN DATA -------------------------------------------------------*/ * MACRO: BIDASK_HL ------------------------- - Corwin-Schultz High-Low Bid-ask Estimator - Heavily inspired by Shane Corwins code: http://sites.nd.edu/scorwin/files/2019/12/Sample-SAS-Program.pdf - Primary change: I adjust prices for stock splits - Arguments: * OUT: Output dataset containing estimates of average monthly bid-ask spread and return volatility * DATA: Input dataset with high and low prices * __min_obs: Minimum amount of daily observations required to compute monthly estimates; %macro bidask_hl(out=, data=, __min_obs=); proc sql; create table __dsf1 as select a.id, a.date, a.eom, a.bidask, a.tvol, a.prc / a.adjfct as prc, a.prc_high / a.adjfct as prc_high, a.prc_low / a.adjfct as prc_low /* Adjust price for stocks splits! */ from &data. 
as a left join scratch.market_returns_daily as b
  on a.excntry=b.excntry and a.date=b.date
  where not missing(b.mkt_vw_exc) /* This ensures that we look at trading days */
  order by id, date;
quit;

* Cleaning data;
/* Screen out invalid daily high/low ranges; prc_low_r/prc_high_r retain the last
   valid range per security (dropped from the output). */
data __dsf2(drop=prc_low_r prc_high_r);
  retain prc_low_r prc_high_r;
  set __dsf1;
  by id date eom;
  * Keep initial values;
  prc_low_in = prc_low;
  prc_high_in = prc_high;
  hlreset = 0;
  * Initial Screens;
  /* Discard the range when the quote is a bid/ask average (bidask=1), the range is
     degenerate (high=low), a price is non-positive, or trading volume is zero. */
  if bidask = 1 or prc_low=prc_high or prc_low<=0 or prc_high<=0 or tvol=0 then do;
    prc_high = .;
    prc_low = .;
  end;
  /* Replace bad/missing price with previous day range */
  if first.id then do; prc_low_r = .; prc_high_r = .; end;
  * Reset retained high and low volume;
  /* NOTE(review): the next statement is garbled -- text between a '<' and a '>'
     appears to have been stripped from this extract, deleting one or more
     statements (the unmatched trailing 'end;' below confirms a lost 'do' block).
     Judging from the surviving hlreset=3 branch and the Corwin-Schultz sample
     program this section should (1) refresh prc_low_r/prc_high_r from a valid
     range, (2) substitute the retained range when today's is bad (hlreset=1/2),
     and (3) shift the range when prc exceeds the retained high (hlreset=3).
     Restore from the upstream repository before running. */
  if 0 prc_high_r then do;
    prc_low = prc_low_r + (prc - prc_high_r);
    prc_high = prc;
    hlreset = 3;
  end;
  end;
  /* Final data screen after H/L reset */
  if prc_low ^= 0 and prc_high/prc_low > 8 then do;
    prc_low = .;
    prc_high = .;
  end;
run;

/* Adjust for overnight returns */
data __dsf3;
  set __dsf2;
  retadj = 0;
  prc_low_t = prc_low;
  prc_high_t = prc_high;
  prc_low_l1 = lag(prc_low);
  prc_high_l1 = lag(prc_high);
  prc_l1 = lag(prc);
  /* Never look back across securities */
  if id ^= lag(id) then do; prc_low_l1 = .; prc_high_l1 = .; prc_l1 = .; end;
  /* NOTE(review): garbled condition -- presumably 'if prc_l1<prc_low and prc_l1>0'
     by symmetry with the mirror branch below; confirm against upstream. */
  if prc_l10 then do;
    * Adjust when prior close is below current low;
    prc_high_t=prc_high-(prc_low-prc_l1);
    prc_low_t=prc_l1;
    retadj=1;
  end;
  if prc_l1>prc_high and prc_l1>0 then do;
    * Adjust when prior close is above current high;
    prc_high_t=prc_l1;
    prc_low_t=prc_low+(prc_l1-prc_high);
    retadj=2;
  end;
run;

/* Calculate daily high/low bid-ask spread*/
data __dsf4;
  set __dsf3;
  pi=constant('PI');
  k2 = sqrt(8/pi);
  const = 3-2*sqrt(2);
  /* Two-day high/low feeding the gamma term of the estimator */
  prc_high_2d=max(prc_high_t,prc_high_l1);
  prc_low_2d=min(prc_low_t,prc_low_l1);
  if prc_low_t>0 and prc_low_l1>0 then beta = (log(prc_high_t/prc_low_t))**2+(log(prc_high_l1/prc_low_l1))**2;
  if prc_low_2d>0 then gamma = (log(prc_high_2d/prc_low_2d))**2;
  alpha = (sqrt(2*beta)-sqrt(beta))/const - sqrt(gamma/const);
  * Calculate spread with missing set to zero;
  spread =
2*(exp(alpha)-1)/(1+exp(alpha)); spread_0 = max(spread,0); * Set negative spread estimates to zero; if spread = . then spread_0 = .; * Calculate daily volatillity; sigma = ((sqrt(beta/2)-sqrt(beta)))/(k2*const)+sqrt(gamma/(k2*k2*const)); sigma_0 = max(sigma,0); * Set negative sigma estimates to zero; if sigma= . then sigma_0 = .; run; /* Monthly bid-ask estimates */ proc sql; create table &out. as select id, eom, mean(spread_0) as bidaskhl_21d, mean(sigma_0) as rvolhl_21d from __dsf4 group by id, eom having count(spread_0) > &__min_obs.; quit; proc delete data=__dsf1 __dsf2 __dsf3 __dsf4; run; %mend; * MACRO: SEASONALITY - Caclulates annual and non-annual seasonality measures following Heston and Sadka (2008) - Specifically, calculate the average return over annual and non-annual lags within the specified start and end dates - Within a given year, the annual lag is lag11 and the non-annual lags are lag0-lag10. - For return predictability, the information should be used to form portfolios at the end of lag0 - Said differently, the seasonality variables should be lagged 1 period relative to returns ; %macro seasonality(start_year=, end_year=); * Return over all lags; __all_ret = 0; __all_n = 0; %do i = %eval((&start_year.-1) * 12) %to %eval(&end_year. * 12 - 1); __all_ret = __all_ret + lag&i(ret_x); __all_n = __all_n + 1; %end; * Return over annual lags; __an_ret = 0; __an_n = 0; %do i = %eval(&start_year.) %to &end_year.; %let __al = %eval(&i. * 12 - 1); __an_ret = __an_ret + lag&__al.(ret_x); __an_n = __an_n + 1; %end; * Return over non-annual lags; __na_ret = __all_ret - __an_ret; __na_n = __all_n - __an_n; * Create Variables; seas_&start_year._&end_year.an = __an_ret / __an_n; seas_&start_year._&end_year.na = __na_ret / __na_n; if count < %eval(&end_year. 
* 12) then do;
    seas_&start_year._&end_year.an = .;
    seas_&start_year._&end_year.na = .;
  end;
  drop __all_ret __all_n __an_ret __an_n __na_ret __na_n;
%mend;

/* MACRO USING ACCOUNTING DATA -------------------------------------------------------*/

* Create Growth in Variable over horizon;
/* Percentage growth of &var_gr. over &horizon. months. Missing during the
   warm-up window (count <= &horizon.) or when the lagged base is non-positive. */
%macro var_growth(var_gr=, horizon=); /* Horizon is in months */
  %let name_gr = %sysfunc(tranwrd(&var_gr., _x, %str())); /* Remove '_x' from var name */
  %let name_gr = &name_gr._gr%sysevalf(&horizon./12); /* Add gr and horizon in years to name */
  &name_gr. = &var_gr./lag&horizon.(&var_gr.)-1;
  if count<=&horizon. or lag&horizon.(&var_gr.)<=0 then &name_gr. = .;
%mend;

* Change in Variable over Horizon Scaled by Assets;
/* Change in &var_gra. over &horizon. months, scaled by current total assets. */
%macro chg_to_assets(var_gra=, horizon=); /* Horizon is in months */
  %let name_gra = %sysfunc(tranwrd(&var_gra., _x, %str())); /* Remove '_x' from var name */
  %let name_gra = &name_gra._gr%sysevalf(&horizon./12); /* Add gr and horizon in years to name */
  %let name_gra = &name_gra.a; /* Add 'a' in the end*/
  &name_gra. = (&var_gra.-lag&horizon.(&var_gra.))/at_x;
  if count<=&horizon. or at_x<=0 then &name_gra. = .;
%mend;

* Ratio Change;
/* Change in the ratio &var1./&var2. over &horizon. months; missing when the
   denominator is non-positive or during the warm-up window. */
%macro chg_var1_to_var2(name=, var1=, var2=, horizon=);
  __x = &var1. / &var2.;
  if &var2. <= 0 then __x=.;
  &name. = (__x - lag&horizon.(__x));
  /* FIX: was 'if count <= horizon' -- a bare (non-existent) data-set variable
     instead of the macro parameter, so the condition was never true and the
     warm-up guard against cross-firm lag contamination did not apply. */
  if count <= &horizon. then &name. = .;
  drop __x;
%mend;

* Change to expectations (Abarnell and Bushee, 1998);
/* Deviation of &var_ce. from its expectation, the average of the 12m and 24m lags. */
%macro chg_to_exp(var_ce=);
  %let name_ce = %sysfunc(tranwrd(&var_ce., _x, %str())); /* Remove '_x' from var name */
  %let name_ce = &name_ce._ce;
  __expect = (lag12(&var_ce.) + lag24(&var_ce.))/2;
  &name_ce. = &var_ce. / (__expect) - 1;
  if count <= 24 or __expect <= 0 then &name_ce. = .;
  drop __expect;
%mend;

* Standardized Unexpected Realization;
* Uses the specification in Jegadeesh and Livnat (2006);
%macro standardized_unexpected(var=, qtrs=, qtrs_min=);
  %let name = %sysfunc(tranwrd(&var., _x, %str())); /* Remove '_x' from var name */
  %let name = &name._su;
  __chg = &var.
- lag12(&var.); /* 12-month-lagged change of the quarterly item */
  __chg_mean = %apply_to_lastq(x = __chg, _qtrs = &qtrs., func = mean);
  __chg_std = %apply_to_lastq(x = __chg, _qtrs = &qtrs., func = std);
  __chg_n = %apply_to_lastq(x = not missing(__chg), _qtrs = &qtrs., func = sum);
  /* Require more than &qtrs_min. non-missing quarterly changes */
  if __chg_n <= &qtrs_min. then do;
    __chg_mean = .;
    __chg_std = .;
  end;
  &name. = (&var. - (lag12(&var.) + lag3(__chg_mean) )) / lag3(__chg_std); /* This is the correct one*/
  if count <= %eval(12 + &qtrs.*3) then &name. = .;
  drop __chg __chg_mean __chg_std __chg_n;
%mend;

* Volatility of Quarterly Data;
/* Rolling std of &var. over the last &qtrs. quarters; missing unless at least
   &qtrs_min. quarterly observations are available and the warm-up window
   (count <= (&qtrs.-1)*3 months) has passed. */
%macro volq(name=, var=, qtrs=, qtrs_min=);
  __n = %apply_to_lastq(x = not missing(&var.), _qtrs = &qtrs., func = sum);
  &name. = %apply_to_lastq(x = &var., _qtrs = &qtrs., func = std);
  if count <= %eval((&qtrs.-1)*3) or __n < &qtrs_min. then &name. = .;
  drop __n;
%mend;

* Volatility of Annual Data;
/* Rolling std of &var. over the last &yrs. annual observations (data kept at
   monthly frequency); requires at least &yrs_min. non-missing values. */
%macro vola(name=, var=, yrs=, yrs_min=);
  __n = %apply_to_lasty(x = not missing(&var.), yrs = &yrs., func = sum);
  &name. = %apply_to_lasty(x = &var., yrs = &yrs., func = std);
  if count <= %eval((&yrs.-1)*12) or __n < &yrs_min. then &name. = .;
  drop __n;
%mend;

* Earnings Smoothness;
/* std(ROA) / std(cash-flow ROA) over the last &esm_h. years, both scaled by
   12-month-lagged total assets; missing without &esm_h. observations of each. */
%macro earnings_variability(esm_h=);
  __roa = ni_x / lag12(at_x);
  __croa = ocf_x / lag12(at_x);
  __roa_n = %apply_to_lasty(x= not missing(__roa), yrs=&esm_h., func=sum);
  __croa_n = %apply_to_lasty(x= not missing(__croa), yrs=&esm_h., func=sum);
  __roa_std = %apply_to_lasty(x=__roa, yrs=&esm_h., func=std);
  __croa_std = %apply_to_lasty(x=__croa, yrs=&esm_h., func=std);
  earnings_variability = __roa_std / __croa_std;
  if count <= %eval(&esm_h. * 12) or __croa_std <= 0 or __roa_n < &esm_h. or __croa_n < &esm_h.
then earnings_variability = .; drop __roa __croa __roa_n __croa_n __roa_std __croa_std; %mend; /* Equity Duration: Forecast of Cash Distribution */ %macro equity_duration_cd(horizon=, r=, roe_mean=, roe_ar1=, g_mean=, g_ar1=); * Create Initial Variables; __roe0 = ni_x / lag12(be_x); __g0 = sale_x / lag12(sale_x) - 1; __be0 = be_x; if count <= 12 or lag12(be_x) <= 1 then __roe0 = .; /* Use 1 million to avoid bad estimates from a small denominator */ if count <= 12 or lag12(sale_x) <= 1 then __g0 = .; /* Use 1 million to avoid bad estimates from a small denominator */ * Forecast Cash Distributions; %let roe_c = &roe_mean.*(1 - &roe_ar1.); %let g_c = &g_mean.*(1 - &g_ar1.); %do i = 1 %to &horizon.; %let j = %eval(&i.-1); __roe&i. = &roe_c. + &roe_ar1. * __roe&j.; __g&i. = &g_c. + &g_ar1. * __g&j.; __be&i. = __be&j. * (1 + __g&i.); __cd&i. = __be&j. * (__roe&i. - __g&i.); %end; * Create Duration Helper Variables; ed_constant = &horizon. + (1 + &r.) / &r.; ed_cd_w = 0; ed_cd = 0; ed_err = 0; %do t = 1 %to &horizon.; ed_cd_w = ed_cd_w + &t. * __cd&t. / (1 + &r.)**&t.; ed_cd = ed_cd + __cd&t. / (1 + &r.)**&t.; if __be&t. 
< 0 then ed_err = 1; %end; drop __roe: __g: __be: __cd:; %mend; * Pitroski (2000) Fundamental Score; %macro pitroski_f(name=); __f_roa = ni_x / lag12(at_x); if count <= 12 or lag12(at_x) <= 0 then __f_roa = .; __f_croa = ocf_x / lag12(at_x); if count <= 12 or lag12(at_x) <= 0 then __f_croa = .; __f_droa = __f_roa - lag12(__f_roa); if count <= 12 then __f_droa = .; __f_acc = __f_croa - __f_roa; __f_lev = dltt / at_x - lag12(dltt / at_x); if count <= 12 or at_x <= 0 or lag12(at_x) <= 0 then __f_lev = .; __f_liq = ca_x / cl_x - lag12(ca_x / cl_x); if count <= 12 or cl_x <= 0 or lag12(cl_x) <= 0 then __f_liq = .; __f_eqis = eqis_x; __f_gm = gp_x / sale_x - lag12(gp_x / sale_x); if count <= 12 or sale_x <= 0 or lag12(sale_x) <= 0 then __f_gm = .; __f_aturn = sale_x / lag12(at_x) - lag12(sale_x) / lag24(at_x); if count <= 24 or lag12(at_x) <= 0 or lag24(at_x) <= 0 then __f_aturn = .; &name. = (__f_roa > 0) + (__f_croa > 0) + (__f_droa > 0) + (__f_acc > 0) + (__f_lev < 0) + (__f_liq > 0) + (coalesce(__f_eqis, 0) = 0) + /* Set __f_eqis to zero if missing. This greatly expands coverage and seems like a reasonable approximation */ (__f_gm > 0) + (__f_aturn > 0); * Only allow __f_eqis to be missing; if missing(__f_roa) or missing(__f_croa) or missing(__f_droa) or missing(__f_acc) or missing(__f_lev) or missing(__f_liq) or missing(__f_gm) or missing(__f_aturn) then &name. 
= .; drop __f_:; %mend; * Ohlson (1980) O-score; %macro ohlson_o(name=); * Create Helpers; __o_lat = log(at_x); __o_lev = debt_x / at_x; __o_wc = (ca_x - cl_x) / at_x; __o_roe = nix_x / at_x; if at_x <= 0 then do; __o_lat = .; __o_lev = .; __o_wc = .; __o_roe = .; end; __o_cacl = cl_x / ca_x; if ca_x <= 0 then __o_cacl = .; __o_ffo = (pi_x + dp) / lt; if lt <= 0 then __o_ffo = .; __o_neg_eq = lt > at_x; if missing(lt) or missing(at_x) then __o_neg_eq = .; __o_neg_earn = (nix_x < 0 and lag12(nix_x) < 0); if count <= 12 or missing(nix_x) or missing(lag12(nix_x)) then __o_neg_earn = .; __o_nich = (nix_x - lag12(nix_x)) / (abs(nix_x) + abs(lag12(nix_x))); if count <= 12 or (abs(nix_x) + abs(lag12(nix_x))) = 0 then __o_nich = .; * Create O-score; &name. = -1.32 - 0.407 * __o_lat + 6.03 * __o_lev - 1.43 * __o_wc + 0.076 * __o_cacl - 1.72 * __o_neg_eq - 2.37 * __o_roe - 1.83 * __o_ffo + 0.285 * __o_neg_earn - 0.52 * __o_nich; %mend; * Altman (1968) Z-score; %macro altman_z(name=); * Create Helpers; __z_wc = (ca_x - cl_x) / at_x; __z_re = re / at_x; __z_eb = ebitda_x / at_x; __z_sa = sale_x / at_x; if at_x <= 0 then do; __z_wc = .; __z_re = .; __z_eb = .; __z_sa = .; end; __z_me = me_fiscal / lt; if lt <= 0 then __z_me = .; * Create Temporary Z-score; &name. = 1.2 * __z_wc + 1.4 * __z_re + 3.3 * __z_eb + 0.6 * __z_me + 1.0 * __z_sa; drop __z: %mend; * Intrinsic ROE based value from Frankel and Lee (1998); %macro intrinsic_value(name=, r=); __iv_po = div_x/nix_x; if nix_x <= 0 then __iv_po = div_x / (at_x * 0.06); __iv_roe = nix_x / ((be_x + lag12(be_x)) / 2); if count <= 12 or (be_x + lag12(be_x)) <= 0 then __iv_roe = .; __iv_be1 = (1 + (1 - __iv_po) * __iv_roe) * be_x; &name. = be_x + (__iv_roe - &r.) / (1 + &r.) * be_x + (__iv_roe - &r.) / ((1 + &r.) * &r.) * __iv_be1; * If Intrinsic value is Non-Positive, set to missing; if &name. <= 0 then &name. 
= .; drop __iv:; %mend;

* Kaplan-Zingales Index;
/* Financial-constraints index. Components are scaled by 12-month lagged net
   PP&E (or assets) and set to missing when the denominator is non-positive or
   the lag is unavailable. */
%macro kz_index(name=);
	* Create Helper Variables;
	__kz_cf = (ni_x + dp) / lag12(ppent); /* cash flow over lagged PP&E */
	if count <= 12 or lag12(ppent)<=0 then __kz_cf = .;
	__kz_q = (at_x + me_fiscal - be_x) / at_x; /* Tobin's q proxy */
	if at_x <= 0 then __kz_q = .;
	__kz_db = debt_x / (debt_x + seq_x); /* book leverage */
	if (debt_x + seq_x) = 0 then __kz_db = .;
	__kz_dv = div_x / lag12(ppent); /* dividends over lagged PP&E */
	if count <= 12 or lag12(ppent)<=0 then __kz_dv = .;
	__kz_cs = che / lag12(ppent); /* cash holdings over lagged PP&E */
	if count <= 12 or lag12(ppent)<=0 then __kz_cs = .;
	* Create Variable;
	&name. = - 1.002 * __kz_cf + 0.283 * __kz_q + 3.139 * __kz_db - 39.368 * __kz_dv - 1.315 * __kz_cs;
%mend;

/* Earnings Predictability/Persistence */
* I scale net income by total assets to account for issuance activity.;
/* For each firm-month, runs a rolling AR(1) regression of ni/at on its
   12-month lag over the past __n years, requiring at least __min observations.
   Outputs per gvkey/curcd/datadate:
     ni_ar1  - AR(1) slope (earnings persistence)
     ni_ivol - residual standard deviation rescaled from the df = n-2 RMSE
               towards an n-1 denominator (earnings predictability)
   Month-ends are processed in &__n.*12 groups so that each pass estimates on
   non-overlapping calculation dates. */
%macro earnings_persistence(out=, data=, __n=, __min=);
	%let __months = %eval(&__n. * 12);
	proc sort data=&data. out=__acc1; by gvkey curcd datadate; run;
	/* Within-firm/currency observation counter used to validate 12-month lags */
	data __acc2;
		set __acc1;
		by gvkey curcd;
		retain count;
		if first.curcd then count = 1; else count = count+1;
	run;
	data __acc3;
		set __acc2;
		__ni_at = ni_x / at_x;
		if at_x <= 0 then __ni_at = .;
		__ni_at_l1 = lag12(__ni_at);
		if count<=12 then __ni_at_l1 =.;
	run;
	proc sql;
		create table __acc4 as
		select gvkey, curcd, datadate, __ni_at, __ni_at_l1
		from __acc3
		where not missing(__ni_at) and not missing(__ni_at_l1);
	quit;
	proc sql;
		create table month_ends as
		select distinct datadate from __acc4 order by datadate;
	quit;
	* Divide data into __n groups;
	proc sql;
		create table dates_apply as
		select *, mod(monotonic(), &__months.) as grp from month_ends;
	quit;
	* Helper macro: If first group, save &new. as &base. otherwise, append &new. to &base.;
	%macro save_or_append(base=, new=);
		%if &__grp. = 0 %then %do;
			data &base.; set &new.; run;
		%end;
		%else %do;
			proc append base=&base. data=&new.; run;
		%end;
	%mend;
	%do __grp=0 %to %eval(&__months. - 1);
		%put ############### GROUP %eval(&__grp.+1) out of &__months. ###############;
		* Prepare data;
		proc sql;
			create table calc_dates as
			select a.datadate, b.datadate as calc_date
			from dates_apply as a
			left join dates_apply(where=(grp = &__grp.)) as b
			on a.datadate > intnx("year", b.datadate, -&__n., "e") and a.datadate <= b.datadate
				and month(a.datadate) = month(b.datadate); /* month(*) ensures annual lags*/
		quit;
		proc sql;
			create table calc_data as
			select a.*, b.calc_date
			from __acc4 as a
			left join calc_dates as b
			on a.datadate = b.datadate
			where not missing(b.calc_date)
			group by a.gvkey, a.curcd, b.calc_date
			having count(*) >= &__min.
			order by a.gvkey, b.calc_date;
		quit;
		proc reg data=calc_data outest=__earn_pers1 edf NOPRINT;
			by gvkey curcd calc_date;
			model __ni_at=__ni_at_l1;
		run;
		proc sql;
			create table __earn_pers2 as
			select gvkey, curcd, calc_date as datadate,
				__ni_at_l1 as ni_ar1, /* outest stores the slope under the regressor's name */
				sqrt(_rmse_**2 * _edf_ / (_edf_ + 1)) as ni_ivol
			from __earn_pers1
			where (_edf_ + 2) >= &__min.; /* _edf_ + 2 = number of observations (intercept + slope) */
		quit;
		%save_or_append(base=op_ep, new=__earn_pers2);
	%end;
	proc sort data=op_ep out=&out. nodup; by gvkey curcd datadate; run;
	proc delete data= __acc1 __acc2 __acc3 __acc4 dates_apply calc_dates calc_data month_ends __earn_pers1 __earn_pers2 op_ep; run;
%mend;

/* MACRO - FIRM AGE */
/* Firm age in months, measured from the earliest of: first CRSP monthly
   observation (by permco), first Compustat annual report minus one year, and
   first Compustat security-month observation minus one year. Falls back to the
   first in-sample eom per id when none of those sources are available. */
%macro firm_age(data=, out=);
	* CRSP first observation;
	proc sql;
		create table crsp_age1 as
		select permco, min(date) as crsp_first format=YYMMDDN8.
		from crsp.msf
		group by permco;
	quit;
	* Compustat accounting first observation;
	proc sql;
		create table comp_acc_age1 as
		select gvkey, datadate from comp.funda
		outer union corr
		select gvkey, datadate from comp.g_funda;
		create table comp_acc_age2 as
		select gvkey, min(datadate) as comp_acc_first format=YYMMDDN8.
		from comp_acc_age1 group by gvkey;
		update comp_acc_age2 set comp_acc_first = intnx('year', comp_acc_first, -1, 'e'); /* When submitting an annual report, the firm must have existed for at least 1 year*/
	quit;
	* Compustat return first obs;
	proc sql;
		create table comp_ret_age1 as
		select gvkey, datadate from comp.secm
		outer union corr
		select gvkey, datadate from comp.g_secd where monthend=1;
		create table comp_ret_age2 as
		select gvkey, min(datadate) as comp_ret_first format=YYMMDDN8.
		from comp_ret_age1 group by gvkey;
		update comp_ret_age2 set comp_ret_first = intnx('year', comp_ret_first, -1, 'e'); /* When submitting an annual report, the firm must have existed for at least 1 year*/
	quit;
	* Add to Dataset;
	proc sql;
		create table comb1 as
		select a.id, a.eom, min(b.crsp_first, c.comp_acc_first, d.comp_ret_first) as first_obs format=YYMMDDN8.
		from &data. as a
		left join crsp_age1 as b on a.permco=b.permco
		left join comp_acc_age2 as c on a.gvkey=c.gvkey
		left join comp_ret_age2 as d on a.gvkey=d.gvkey;
		create table comb2 as
		select *, min(eom) as first_alt format=YYMMDDN8. /* A few (0.3% of all obs) north american observations don't have observations in comp.secm so this is a fall back option*/
		from comb1 group by id;
		create table comb3 as
		select *, intck ('month', min(first_obs, first_alt), eom) as age from comb2;
		alter table comb3 drop first_obs, first_alt;
	quit;
	* Output;
	proc sort data=comb3 out=&out.; by id eom; run;
%mend;

================================================ FILE: GlobalFactors/ind_identification.sas ================================================

* MACRO: FF_IND_CLASS Add variable matching 4-digit SIC identifiers to Fama-French industry identifiers Arguments: data: name of input dataset that includes 4-digit SIC codes under name 'sic' ff_grps: number of industry portfolios for Fama-French identifiers OUT: name of output dataset;
%macro ff_ind_class(data=, ff_grps=, out=);
	%if &ff_grps = 38 %then %do;
		proc sql;
			/* French identifies "Other" as "almost nothing", so no firms are identified as "other" https://mba.tuck.dartmouth.edu/pages/faculty/ken.french/Data_Library/det_38_ind_port.html*/
			create table &out.
			/* Maps 4-digit SIC ranges to the 38 Fama-French industries; unmatched
			   codes (including group 38, "Other" - see comment above) get missing */
			as select *, case
				when 100 <= sic <= 999 then 1
				when 1000 <= sic <= 1299 then 2
				when 1300 <= sic <= 1399 then 3
				when 1400 <= sic <= 1499 then 4
				when 1500 <= sic <= 1799 then 5
				when 2000 <= sic <= 2099 then 6
				when 2100 <= sic <= 2199 then 7
				when 2200 <= sic <= 2299 then 8
				when 2300 <= sic <= 2399 then 9
				when 2400 <= sic <= 2499 then 10
				when 2500 <= sic <= 2599 then 11
				when 2600 <= sic <= 2661 then 12
				when 2700 <= sic <= 2799 then 13
				when 2800 <= sic <= 2899 then 14
				when 2900 <= sic <= 2999 then 15
				when 3000 <= sic <= 3099 then 16
				when 3100 <= sic <= 3199 then 17
				when 3200 <= sic <= 3299 then 18
				when 3300 <= sic <= 3399 then 19
				when 3400 <= sic <= 3499 then 20
				when 3500 <= sic <= 3599 then 21
				when 3600 <= sic <= 3699 then 22
				when 3700 <= sic <= 3799 then 23
				when 3800 <= sic <= 3879 then 24
				when 3900 <= sic <= 3999 then 25
				when 4000 <= sic <= 4799 then 26
				when 4800 <= sic <= 4829 then 27
				when 4830 <= sic <= 4899 then 28
				when 4900 <= sic <= 4949 then 29
				when 4950 <= sic <= 4959 then 30
				when 4960 <= sic <= 4969 then 31
				when 4970 <= sic <= 4979 then 32
				when 5000 <= sic <= 5199 then 33
				when 5200 <= sic <= 5999 then 34
				when 6000 <= sic <= 6999 then 35
				when 7000 <= sic <= 8999 then 36
				when 9000 <= sic <= 9999 then 37
				else . end as ff38
			from &data.;
		run;
	%end;
	%else %if &ff_grps. = 49 %then %do;
		proc sql;
			create table &out.
			/* Maps 4-digit SIC codes to the 49 Fama-French industries, following
			   Ken French's published SIC range definitions */
			as select *, case
				when sic = 2048 or 100 <= sic <= 299 or 700 <= sic <= 799 or 910 <= sic <= 919 then 1
				when sic in (2095, 2098, 2099) or 2000 <= sic <= 2046 or 2050 <= sic <= 2063 or 2070 <= sic <= 2079 or 2090 <= sic <= 2092 then 2
				when sic in (2086, 2087, 2096, 2097) or 2064 <= sic <= 2068 then 3
				when sic = 2080 or 2082 <= sic <= 2085 then 4
				when 2100 <= sic <= 2199 then 5
				when sic in (3732, 3930, 3931) or 920 <= sic <= 999 or 3650 <= sic <= 3652 or 3940 <= sic <= 3949 then 6
				when sic in (7840, 7841, 7900, 7910, 7911, 7980) or 7800 <= sic <= 7833 or 7920 <= sic <= 7933 or 7940 <= sic <= 7949 or 7990 <= sic <= 7999 then 7
				when sic in (2770, 2771) or 2700 <= sic <= 2749 or 2780 <= sic <= 2799 then 8
				when sic in (2047, 2391, 2392, 3160, 3161, 3229, 3260, 3262, 3263, 3269, 3230, 3231, 3750, 3751, 3800, 3860, 3861, 3910, 3911, 3914, 3915, 3991, 3995) or 2510 <= sic <= 2519 or 2590 <= sic <= 2599 or 2840 <= sic <= 2844 or 3170 <= sic <= 3172 or 3190 <= sic <= 3199 or 3630 <= sic <= 3639 or 3870 <= sic <= 3873 or 3960 <= sic <= 3962 then 9
				when sic in (3020, 3021, 3130, 3131, 3150, 3151) or 2300 <= sic <= 2390 or 3100 <= sic <= 3111 or 3140 <= sic <= 3149 or 3963 <= sic <= 3965 then 10
				when 8000 <= sic <= 8099 then 11
				when sic in (3693, 3850, 3851) or 3840 <= sic <= 3849 then 12
				when sic in (2830, 2831) or 2833 <= sic <= 2836 then 13
				when 2800 <= sic <= 2829 or 2850 <= sic <= 2879 or 2890 <= sic <= 2899 then 14
				when sic in (3031, 3041) or 3050 <= sic <= 3053 or 3060 <= sic <= 3099 then 15
				when 2200 <= sic <= 2284 or 2290 <= sic <= 2295 or 2297 <= sic <= 2299 or 2393 <= sic <= 2395 or 2397 <= sic <= 2399 then 16
				when sic in (2660, 2661, 3200, 3210, 3211, 3240, 3241, 3261, 3264, 3280, 3281, 3446, 3996) or 800 <= sic <= 899 or 2400 <= sic <= 2439 or 2450 <= sic <= 2459 or 2490 <= sic <= 2499 or 2950 <= sic <= 2952 or 3250 <= sic <= 3259 or 3270 <= sic <= 3275 or 3290 <= sic <= 3293 or 3295 <= sic <= 3299 or 3420 <= sic <= 3429 or 3430 <= sic <= 3433 or 3440 <= sic <= 3442
				or 3448 <= sic <= 3452 or 3490 <= sic <= 3499 then 17
				when 1500 <= sic <= 1511 or 1520 <= sic <= 1549 or 1600 <= sic <= 1799 then 18
				when sic = 3300 or 3310 <= sic <= 3317 or 3320 <= sic <= 3325 or 3330 <= sic <= 3341 or 3350 <= sic <= 3357 or 3360 <= sic <= 3379 or 3390 <= sic <= 3399 then 19
				when sic in (3400, 3443, 3444) or 3460 <= sic <= 3479 then 20
				when sic in (3538, 3585, 3586) or 3510 <= sic <= 3536 or 3540 <= sic <= 3569 or 3580 <= sic <= 3582 or 3589 <= sic <= 3599 then 21
				when sic in (3600, 3620, 3621, 3648, 3649, 3660, 3699) or 3610 <= sic <= 3613 or 3623 <= sic <= 3629 or 3640 <= sic <= 3646 or 3690 <= sic <= 3692 then 22
				when sic in (2296, 2396, 3010, 3011, 3537, 3647, 3694, 3700, 3710, 3711, 3799) or 3713 <= sic <= 3716 or 3790 <= sic <= 3792 then 23
				when sic in (3720, 3721, 3728, 3729) or 3723 <= sic <= 3725 then 24
				when sic in (3730, 3731) or 3740 <= sic <= 3743 then 25
				when sic = 3795 or 3760 <= sic <= 3769 or 3480 <= sic <= 3489 then 26
				when 1040 <= sic <= 1049 then 27
				when 1000 <= sic <= 1039 or 1050 <= sic <= 1119 or 1400 <= sic <= 1499 then 28
				when 1200 <= sic <= 1299 then 29
				when sic in (1300, 1389) or 1310 <= sic <= 1339 or 1370 <= sic <= 1382 or 2900 <= sic <= 2912 or 2990 <= sic <= 2999 then 30
				when sic in (4900, 4910, 4911, 4939) or 4920 <= sic <= 4925 or 4930 <= sic <= 4932 or 4940 <= sic <= 4942 then 31
				when sic in (4800, 4899) or 4810 <= sic <= 4813 or 4820 <= sic <= 4822 or 4830 <= sic <= 4841 or 4880 <= sic <= 4892 then 32
				when sic in (7020, 7021, 7200, 7230, 7231, 7240, 7241, 7250, 7251, 7395, 7500, 7600, 7620, 7622, 7623, 7640, 7641) or 7030 <= sic <= 7033 or 7210 <= sic <= 7212 or 7214 <= sic <= 7217 or 7219 <= sic <= 7221 or 7260 <= sic <= 7299 or 7520 <= sic <= 7549 or 7629 <= sic <= 7631 or 7690 <= sic <= 7699 or 8100 <= sic <= 8499 or 8600 <= sic <= 8699 or 8800 <= sic <= 8899 or 7510 <= sic <= 7515 then 33
				when sic in (3993, 7218, 7300, 7374, 7396, 7397, 7399, 7519, 8700, 8720, 8721) or 2750 <= sic <= 2759 or 7310 <= sic <=
7342 or 7349 <= sic <= 7353 or 7359 <= sic <= 7369 or 7376 <= sic <= 7385 or 7389 <= sic <= 7394 or 8710 <= sic <= 8713 or 8730 <= sic <= 8734 or 8740 <= sic <= 8748 or 8900 <= sic <= 8911 or 8920 <= sic <= 8999 or 4220 <= sic <= 4229 then 34 when sic = 3695 or 3570 <= sic <= 3579 or 3680 <= sic <= 3689 then 35 when sic = 7375 or 7370 <= sic <= 7373 then 36 when sic in (3622, 3810, 3812) or 3661 <= sic <= 3666 or 3669 <= sic <= 3679 then 37 when sic = 3811 or 3820 <= sic <= 3827 or 3829 <= sic <= 3839 then 38 when sic in (2760, 2761) or 2520 <= sic <= 2549 or 2600 <= sic <= 2639 or 2670 <= sic <= 2699 or 3950 <= sic <= 3955 then 39 when sic in (3220, 3221) or 2440 <= sic <= 2449 or 2640 <= sic <= 2659 or 3410 <= sic <= 3412 then 40 when sic in (4100. 4130, 4131, 4150, 4151, 4230, 4231, 4780, 4789) or 4000 <= sic <= 4013 or 4040 <= sic <= 4049 or 4110 <= sic <= 4121 or 4140 <= sic <= 4142 or 4170 <= sic <= 4173 or 4190 <= sic <= 4200 or 4210 <= sic <= 4219 or 4240 <= sic <= 4249 or 4400 <= sic <= 4700 or 4710 <= sic <= 4712 or 4720 <= sic <= 4749 or 4782 <= sic <= 4785 then 41 when sic in (5000, 5099, 5100) or 5010 <= sic <= 5015 or 5020 <= sic <= 5023 or 5030 <= sic <= 5060 or 5063 <= sic <= 5065 or 5070 <= sic <= 5078 or 5080 <= sic <= 5088 or 5090 <= sic <= 5094 or 5110 <= sic <= 5113 or 5120 <= sic <= 5122 or 5130 <= sic <= 5172 or 5180 <= sic <= 5182 or 5190 <= sic <= 5199 then 42 when sic in (5200, 5250, 5251, 5260, 5261, 5270, 5271, 5300, 5310, 5311, 5320, 5330, 5331, 5334, 5900, 5999) or 5210 <= sic <= 5231 or 5340 <= sic <= 5349 or 5390 <= sic <= 5400 or 5410 <= sic <= 5412 or 5420 <= sic <= 5469 or 5490 <= sic <= 5500 or 5510 <= sic <= 5579 or 5590 <= sic <= 5700 or 5710 <= sic <= 5722 or 5730 <= sic <= 5736 or 5750 <= sic <= 5799 or 5910 <= sic <= 5912 or 5920 <= sic <= 5932 or 5940 <= sic <= 5990 or 5992 <= sic <= 5995 then 43 when sic in (7000, 7213) or 5800 <= sic <= 5829 or 5890 <= sic <= 5899 or 7010 <= sic <= 7019 or 7040 <= sic <= 7049 then 44 when 
sic = 6000 or 6010 <= sic <= 6036 or 6040 <= sic <= 6062 or 6080 <= sic <= 6082 or 6090 <= sic <= 6100 or 6110 <= sic <= 6113 or 6120 <= sic <= 6179 or 6190 <= sic <= 6199 then 45
				when sic in (6300, 6350, 6351, 6360, 6361) or 6310 <= sic <= 6331 or 6370 <= sic <= 6379 or 6390 <= sic <= 6411 then 46
				when sic in (6500, 6510, 6540, 6541, 6610, 6611) or 6512 <= sic <= 6515 or 6517 <= sic <= 6532 or 6550 <= sic <= 6553 or 6590 <= sic <= 6599 then 47
				when sic in (6700, 6798, 6799) or 6200 <= sic <= 6299 or 6710 <= sic <= 6726 or 6730 <= sic <= 6733 or 6740 <= sic <= 6779 or 6790 <= sic <= 6795 then 48
				when sic in (4970, 4971, 4990, 4991) or 4950 <= sic <= 4961 then 49
				else . end as ff49
			from &data.;
		run;
	%end;
%mend;

* MACRO: CRSP_INDUSTRY Create daily historical SIC and NAICS industry identifiers dataset from CRSP data Arguments: out: name of output dataset;
/* Expands each crsp.dsenames validity window [namedt, nameendt] into one row
   per calendar day. Missing identifiers are temporarily coded -999 so the
   day-expansion data step carries them along, then recoded to missing. */
%macro crsp_industry(out=);
	/* Pull distinct date ranges and identifiers from CRSP datasets */
	proc sql;
		create table permno0 as
		select distinct permno, permco, namedt, nameendt, siccd as sic,
			input(naics, 6.0) as naics/* NAICS codes can't have leading zeroes so this should be okay*/
		from crsp.dsenames
		order by permno, namedt, nameendt;
	run;
	/* Alter missing industry identifiers (sic = 0 also treated as missing) */
	data permno1;
		set permno0;
		if missing(sic) then sic = -999;
		if sic = 0 then sic = -999;
		if missing(naics) then naics = -999;
	run;
	/* Find date distance for date ranges */
	data permno2; set permno1; permno_diff = intck('day', namedt, nameendt, 'd'); run;
	proc sort data = permno2; by permno namedt nameendt; run;
	/* Create new rows between valid dates */
	data permno3;
		set permno2;
		output;
		n = 0;
		if permno_diff > 0 then do;
			do until(n = permno_diff);
				namedt = intnx('day', namedt, 1);
				n + 1;
				output;
			end;
		end;
		drop nameendt permno_diff n;
	run;
	/* Get ready for output: recode the -999 sentinels back to missing */
	data permno4;
		set permno3;
		if sic = -999 then sic = .;
		if naics = -999 then naics = .;
		date = namedt;
		drop namedt;
		format date yymmddn8.;
	run;
	proc sort data= permno4 out= &out. nodup; by permno date; run;
	proc delete data = permno0 permno1 permno2 permno3 permno4; run;
%mend;

* MACRO: COMP_SIC_NAICS Create a daily historical SIC and NAICS industry identifiers dataset using NA and global annual reports Arguments: OUT: name of output dataset;
/* Coalesces annual SIC/NAICS codes from comp.funda (NA) and comp.g_funda
   (global), then extends each report date forward daily until the day before
   the firm's next report (the most recent report covers only its own date). */
%macro comp_sic_naics(OUT =, ff_num =);
	proc sql;
		/* Retrieve NA identifiers */
		create table comp1 as
		select distinct gvkey, datadate, sich as sic, naicsh as naics from COMP.FUNDA;
	run;
	/* Fix error of gvkey code 175650 */
	data comp2;
		set comp1;
		if gvkey = "175650" and datadate = '31DEC2005'd and missing(naics) then delete;
	run;
	proc sql;
		/* Retrieve global identifiers */
		create table comp3 as
		select distinct gvkey, datadate, sich as sic, naicsh as naics from COMP.G_FUNDA;
		/* Join global and NA data */
		create table comp4 as
		select a.gvkey as gvkeya, a.datadate as datea, a.sic as sica, a.naics as naicsa,
			b.gvkey as gvkeyb, b.datadate as dateb, b.sic as sicb, b.naics as naicsb
		from comp2 as a
		full join comp3 as b
		on a.gvkey = b.gvkey and a.datadate = b.datadate;
	run;
	/* Coalesce NA and global */
	data comp5;
		set comp4;
		gvkey = put(coalesce(gvkeya, gvkeyb), $z6.);
		date = coalesce(datea, dateb);
		sic = coalesce(sica, sicb);
		naics = coalesce(naicsa, naicsb);
		format date yymmddn8.;
		drop gvkeya gvkeyb datea dateb sica sicb naicsa naicsb;
	run;
	/* Sort descending so lag(date) below picks up the firm's NEXT report date */
	proc sort data = comp5; by gvkey descending date; run;
	/* Add valid date to in order to extend to daily observation */
	data comp6;
		set comp5;
		by gvkey;
		valid_to = intnx('day', lag(date), -1); /* day before the next report */
		if FIRST.gvkey then do; /* latest report per firm: valid only on its own date */
			valid_to = date;
		end;
		format valid_to yymmddn8.;
	run;
	/* Re-sort */
	proc sort data = comp6; by gvkey date valid_to; run;
	/* Find date distance for date ranges */
	data comp7; set comp6; comp_diff = intck('day', date, valid_to, 'd'); run;
	proc sort data = comp7; by gvkey date valid_to; run;
	/* Create new rows between valid dates */
	data comp8;
		set comp7;
		output;
		n = 0;
		if comp_diff > 0 and comp_diff ne . then do;
			do until(n = comp_diff);
				date = intnx('day', date, 1);
				n + 1;
				output;
			end;
		end;
		drop valid_to comp_diff n;
	run;
	proc sort data= comp8 out= &out. nodup; by gvkey date; run;
	proc delete data = comp1 comp2 comp3 comp4 comp5 comp6 comp7 comp8; run;
%mend;

* MACRO: COMP_HGICS Create a daily historical gics dataset from COMPUSTAT, either from the NA or global dataset Arguments: lib: COMPUSTAT library from which to pull historical gics data (CO_HGICS if NA, G_CO_HGICS if global) OUT: name of output dataset;
/* Expands each [indfrom, indthru] GICS validity window into daily rows; an
   open-ended indthru on a firm's last record is closed at today's date. */
%macro COMP_HGICS(lib =, out =);
	/* Pull historical gics data */
	proc sql;
		create table gic1 as
		select distinct gvkey, indfrom, indthru, gsubind as gics
		from comp.&lib.
		where not missing(gvkey);
	run;
	proc sort data = gic1; by gvkey indfrom; run;
	/* Alter missing gics */
	data gic2; set gic1; by gvkey; if missing(gics) then gics = -999; run;
	/* Adjust indthru: close the last, still-open window at today's date */
	data gic3;
		set gic2;
		by gvkey indfrom indthru;
		if LAST.gvkey and indthru = . then indthru = today();
	run;
	/* Estimate difference between indfrom and indthru */
	data gic4; set gic3; gic_diff = intck('days', indfrom, indthru); run;
	proc sort data = gic4; by gvkey indfrom indthru; run;
	/* Add rows to create daily data */
	data gic5;
		set gic4;
		by gvkey;
		output;
		n = 0;
		if gic_diff > 0 and gic_diff ne . then do;
			do until(n = gic_diff);
				indfrom = intnx('day', indfrom, 1);
				n + 1;
				output;
			end;
		end;
	run;
	data gic6;
		set gic5;
		date = indfrom;
		format date yymmddn8.;
		drop indfrom indthru gic_diff n;
	run;
	proc sort data= gic6 out=&OUT nodup; by gvkey date; run;
	proc delete data = gic1 gic2 gic3 gic4 gic5 gic6; run;
%mend COMP_HGICS;

/* MACRO: HGICS JOIN Join NA and global daily historical gics data from COMPUSTAT Argument: OUT: name of output dataset */
%macro HGICS_JOIN(out=);
	/* Construct NA and global historical gics data */
	%comp_hgics(lib = co_hgic, OUT = na_hgics);
	%comp_hgics(lib = g_co_hgic, OUT = g_hgics);
	proc sql;
		create table gjoin1 as
		select a.gvkey as na_gvkey, a.gics as na_gics, a.date as na_date,
			b.gvkey as g_gvkey, b.gics as g_gics, b.date as g_date
		from na_hgics as a
		full join g_hgics as b
		on a.gvkey = b.gvkey and a.date = b.date;
	/* Coalesce NA and global (the data step implicitly terminates proc sql) */
	data gjoin2;
		set gjoin1;
		gvkey = put(coalesce(na_gvkey, g_gvkey), $z6.);
		date = coalesce(na_date, g_date);
		gics = coalesce(na_gics, g_gics);
		format date yymmddn8.;
		drop na_gvkey na_date na_gics g_gvkey g_date g_gics;
	run;
	proc sort data = gjoin2 out= &out nodup; by gvkey date; run;
	proc delete data = na_hgics g_hgics gjoin1 gjoin2; run;
%mend;

* MACRO: COMP_INDUSTRY Join SIC and NAICS industry identifiers to GICS identifiers constructed from COMPUSTAT data Arguments: OUT: name of output dataset;
/* Merges the daily GICS and SIC/NAICS panels, then fills any calendar gaps
   between coverage spells with rows whose identifiers are all missing so the
   output is a contiguous daily panel per gvkey. */
%macro comp_industry(out=);
	/* Construct datasets */
	%hgics_join(out=comp_gics);
	%comp_sic_naics(out=comp_other);
	/* Join datasets */
	data join1; merge comp_gics comp_other; by gvkey date; run;
	proc sort data = join1 nodupkey; by gvkey date; run;
	/* Check for gaps in coverage */
	data join2;
		set join1;
		by gvkey date;
		lagdate = lag(date);
		date_1 = intnx('day', date, -1);
		gap = 0;
		format lagdate yymmddn8.
date_1 yymmddn8.;
	run;
	/* gap = 1 when the previous row is not the previous calendar day */
	data join3;
		set join2;
		by gvkey date;
		if not FIRST.gvkey and lagdate ne date_1 then gap = 1;
	run;
	/* Create rows for gaps in coverage with all indicators as missing */
	proc sql; create table gap1 as select * from join3 where gap = 1; run;
	/* Size of gap */
	data gap2; set gap1; diff = intck('days', lagdate, date); run;
	/* Add rows to create daily data: fill backwards from the first row after
	   the gap, setting all identifiers to missing for the inserted days */
	data gap3;
		set gap2;
		by gvkey date;
		output;
		n = 0;
		if gap = 1 then do;
			do until(n = diff - 1);
				date = intnx('day', date, -1);
				gics = .; sic = .; naics = .;
				n + 1;
				output;
			end;
		end;
		drop lagdate date_1 gap diff n;
	run;
	proc sort data = gap3; by gvkey date; run;
	/* Join added rows to original daily data */
	data joined1; merge join1 gap3; by gvkey date; run;
	proc sort data = joined1 out= &out. nodup; by gvkey date; run;
	proc delete data = comp_gics comp_other join1 join2 join3 gap1 gap2 gap3 joined1; run;
%mend;

================================================ FILE: GlobalFactors/main.sas ================================================

/* Clean working environment */
proc delete data = _all_ ; run ;

***************************************************************************
* Manual Inputs
***************************************************************************;
* Assign scratch and project folder names;
%let scratch_folder = /scratch/INSTITUTION/FOLDER;
%let project_folder = ~/Global Data;
* Set defaults;
%let delete_temp = 1; * Should temporary files be deleted?;
%let save_csv = 1; * Should the main data set be saved country-by-country in a .csv format?;
%let save_daily_ret = 1; * Save daily stocks returns country-by-country in a .csv format?;
%let save_monthly_ret = 1; * Save monthly stocks returns in a .csv format;
%let end_date = '31DEC2024'd; * Date of last observation: CRSP data is only updated annually, so we keep this updating frequency for consistency. Should be incremented every time there is an update to the CRSP database;

***************************************************************************
* Libraries and Functions
***************************************************************************;
* Libraries;
options dlcreatedir;
libname scratch "&scratch_folder.";
libname project "&project_folder.";
* Project macros;
%include "&project_folder./project_macros.sas";
%include "&project_folder./char_macros.sas";
%include "&project_folder./market_chars.sas";
%include "&project_folder./accounting_chars.sas";
%include "&project_folder./ind_identification.sas";

*****************************************************************************
* Create Return Data
****************************************************************************;
%prepare_comp_sf(freq=both);
%clean_comp_msf(data=comp_msf); * Delete obvious data errors (work-in-progress);
%prepare_crsp_sf(freq=d);
%prepare_crsp_sf(freq=m);
%combine_crsp_comp_sf(out_msf=world_msf1, out_dsf=scratch.world_dsf, crsp_msf=crsp_msf, comp_msf=comp_msf, crsp_dsf=crsp_dsf, comp_dsf=comp_dsf);
proc delete data=comp_dsf crsp_dsf comp_msf crsp_msf; run;

*****************************************************************************
* Add Industry Codes
*****************************************************************************;
%crsp_industry(out=crsp_ind);
%comp_industry(out=comp_ind);
proc sql;
	create table world_msf2 as
	select a.*, b.gics as gics, coalesce(b.sic, c.sic) as sic, coalesce(b.naics, c.naics) as naics
	from world_msf1 as a
	left join comp_ind as b on a.gvkey=b.gvkey and a.eom=b.date
	left join crsp_ind as c on a.permco=c.permco and a.permno=c.permno and a.eom=c.date;
quit;
proc delete data=world_msf1 crsp_ind comp_ind; run; * Prefer COMPUSTAT to CRSP;
* Add a column 'ff49' with Fama-French industry classification;
%ff_ind_class(data=world_msf2, ff_grps=49, out=world_msf3);
* Size cutoffs;
%nyse_size_cutoffs(data=world_msf3, out=scratch.nyse_cutoffs);
* Classify stocks into size groups (NYSE breakpoints: p80/p50/p20/p1);
proc sql;
	create table scratch.world_msf as
	select case
		when missing(a.me) then ('')
		when a.me >= b.nyse_p80 then 'mega'
		when a.me >= b.nyse_p50 then 'large'
		when a.me >= b.nyse_p20 then 'small'
		when a.me >= b.nyse_p1 then 'micro'
		else 'nano'
	end as size_grp, a.*
	from world_msf3 as a
	left join scratch.nyse_cutoffs as b on a.eom=b.eom;
quit;
proc delete data=world_msf2 world_msf3; run;
* Return cutoffs;
%return_cutoffs(data=scratch.world_msf, freq=m, out=scratch.return_cutoffs, crsp_only=0);
%return_cutoffs(data=scratch.world_dsf, freq=d, out=scratch.return_cutoffs_daily, crsp_only=0);

*****************************************************************************
* Market Returns
****************************************************************************;
%market_returns(out = scratch.market_returns, data=scratch.world_msf, freq=m, wins_comp=1, wins_data=scratch.return_cutoffs, cap_data=scratch.nyse_cutoffs);
%market_returns(out = scratch.market_returns_daily, data=scratch.world_dsf, freq=d, wins_comp=1, wins_data=scratch.return_cutoffs_daily, cap_data=scratch.nyse_cutoffs);

*****************************************************************************
* Create Characteristics Based on Accounting Data
****************************************************************************;
%standardized_accounting_data(coverage='world', convert_to_usd=1, me_data = scratch.world_msf, include_helpers_vars=1, start_date='31DEC1949'd);
%create_acc_chars(data=acc_std_ann, out=achars_world, lag_to_public=4, max_data_lag=18, __keep_vars=&acc_chars., me_data=scratch.world_msf, suffix=);
%create_acc_chars(data=acc_std_qtr, out=qchars_world, lag_to_public=4, max_data_lag=18, __keep_vars=&acc_chars., me_data=scratch.world_msf, suffix=_qitem);
%combine_ann_qtr_chars(out=scratch.acc_chars_world, ann_data=achars_world, qtr_data=qchars_world, __char_vars=&acc_chars., q_suffix=_qitem);

*****************************************************************************
* Create Characteristics Based on Monthly Market Data
****************************************************************************;
%market_chars_monthly(out=scratch.market_chars_m, data=scratch.world_msf, market_ret=scratch.market_returns, local_currency=0);
* Free up space;
proc datasets library=work kill nolist; quit;

*****************************************************************************
* Combine Returns, Accounting and Monthly Market Data
****************************************************************************;
proc sql;
	create table world_data_prelim as
	select a.*, b.*, c.*
	from scratch.world_msf as a
	left join scratch.market_chars_m as b on a.id=b.id and a.eom=b.eom
	left join scratch.acc_chars_world as c on a.gvkey=c.gvkey and a.eom=c.public_date;
	alter table world_data_prelim drop div_tot, div_cash, div_spc, public_date, source;
quit;
%if &delete_temp.=1 %then %do;
	proc delete data= scratch.market_chars_m scratch.acc_chars_world; run;
%end;

*****************************************************************************
* Asset Pricing Factors
****************************************************************************;
* Create monthly and daily factors from FF3 and HXZ4;
%ap_factors(out=scratch.ap_factors_daily, freq=d, sf=scratch.world_dsf, mchars=world_data_prelim, mkt=scratch.market_returns_daily, min_stocks_bp=10, min_stocks_pf=3);
%ap_factors(out=scratch.ap_factors_monthly, freq=m, sf=scratch.world_msf, mchars=world_data_prelim, mkt=scratch.market_returns, min_stocks_bp=10, min_stocks_pf=3);

*****************************************************************************
* Factor based on combined data
****************************************************************************;
%firm_age(data=scratch.world_msf, out=scratch.firm_age);
%mispricing_factors(out=scratch.mp_factors, data=world_data_prelim, min_stks=10, min_fcts=3);
%market_beta(out=scratch.beta_60m, data=scratch.world_msf, fcts=scratch.ap_factors_monthly, __n=60, __min=36);
%residual_momentum(out=scratch.resmom_ff3, data=scratch.world_msf, fcts=scratch.ap_factors_monthly, type=ff3, __n =36, __min=24, incl=12 6, skip=1 1);
* Free up space;
proc datasets library=work nolist;
	delete _all_ / memtype=data; /* Deletes all datasets */
	protect world_data_prelim; /* Prevents world_data_prelim from being deleted */
quit;

*****************************************************************************
* Create Characteristics Based on Daily Market Data
****************************************************************************;
%bidask_hl(out=scratch.corwin_schultz, data=scratch.world_dsf, __min_obs=10);
%prepare_daily(data=scratch.world_dsf, fcts=scratch.ap_factors_daily);
%roll_apply_daily(out=scratch.roll_21d, __n=1, __min=15, fcts=scratch.ap_factors_daily, __month_ends=month_ends, sfx =_21d, __stats= rvol rmax skew capm_ext ff3 hxz4 dimsonbeta zero_trades);
%roll_apply_daily(out=scratch.roll_126d, __n=6, __min=60, fcts=scratch.ap_factors_daily, __month_ends=month_ends, sfx =_126d, __stats= zero_trades turnover dolvol ami);
%roll_apply_daily(out=scratch.roll_252d, __n=12, __min=120, fcts=scratch.ap_factors_daily, __month_ends=month_ends, sfx =_252d, __stats= rvol capm downbeta zero_trades prc_to_high mktvol);
%roll_apply_daily(out=scratch.roll_1260d, __n=60, __min=750, fcts=scratch.ap_factors_daily, __month_ends=month_ends, sfx =_1260d, __stats= mktcorr);
%finish_daily_chars(out=scratch.market_chars_d);
%if &delete_temp.=1 %then %do;
	proc delete data= scratch.corwin_schultz scratch.roll_21d scratch.roll_126d scratch.roll_252d scratch.roll_1260d scratch.ap_factors_daily scratch.ap_factors_monthly ; run;
%end;
* Free up space;
proc datasets library=work nolist;
	delete _all_ / memtype=data; /* Deletes all datasets */
	protect world_data_prelim; /* Prevents world_data_prelim from being deleted */
quit;

*****************************************************************************
* Combine all characteristics and build final dataset
****************************************************************************;
proc sql;
	create table world_data3 as
	select a.*, b.beta_60m, b.ivol_capm_60m, c.resff3_12_1, d.resff3_6_1, e.mispricing_mgmt, e.mispricing_perf, f.*, g.age
	from world_data_prelim as a
	left join scratch.beta_60m as b on a.id=b.id and a.eom=b.eom
	left join scratch.resmom_ff3_12_1 as c on a.id=c.id and a.eom=c.eom
	left join scratch.resmom_ff3_6_1 as d on a.id=d.id and a.eom=d.eom
	left join scratch.mp_factors as e on a.id=e.id and a.eom=e.eom
	left join scratch.market_chars_d as f on a.id=f.id and a.eom=f.eom
	left join scratch.firm_age as g on a.id=g.id and a.eom=g.eom;
quit;
* Add Quality minus Junk;
%quality_minus_junk(out=scratch.qmj, data=world_data3, min_stks=10);
proc sql;
	create table world_data4 as
	select a.*, b.qmj, b.qmj_prof, b.qmj_growth, b.qmj_safety
	from world_data3 as a
	left join scratch.qmj as b on a.excntry=b.excntry and a.id=b.id and a.eom=b.eom;
quit;
* Reorder Variables (RETAIN before SET fixes column order in the output);
data world_data5;
	retain id date eom source_crsp size_grp obs_main exch_main primary_sec gvkey iid permno permco excntry curcd fx common comp_tpci crsp_shrcd comp_exchg crsp_exchcd gics sic naics ff49 adjfct shares me me_company prc prc_local dolvol ret ret_local ret_exc ret_lag_dif ret_exc_lead1m market_equity enterprise_value book_equity assets sales net_income;
	set world_data4;
run;
* Delete Temporary Files;
%if &delete_temp.=1 %then %do;
	proc delete data= world_data_prelim scratch.beta_60m scratch.qmj scratch.resmom_ff3_12_1 scratch.resmom_ff3_6_1 scratch.mp_factors scratch.firm_age scratch.market_chars_d; run;
%end;
* Save combined data;
proc sort data=world_data5 out=scratch.world_data nodup; by id eom; run;

*****************************************************************************
* Create Output in .csv Format for Download
****************************************************************************;
* Create Output Folder;
options dlcreatedir;
libname op "&scratch_folder./output";
option nonotes;
* Small Files;
proc export data=scratch.market_returns_daily outfile="&scratch_folder./output/market_returns_daily.csv" dbms=CSV replace; run;
proc export data=scratch.market_returns outfile="&scratch_folder./output/market_returns.csv" dbms=CSV replace; run;
proc export data=scratch.nyse_cutoffs outfile="&scratch_folder./output/nyse_cutoffs.csv" dbms=CSV replace; run;
proc export data=scratch.return_cutoffs outfile="&scratch_folder./output/return_cutoffs.csv" dbms=CSV replace; run;
proc export data=scratch.return_cutoffs_daily outfile="&scratch_folder./output/return_cutoffs_daily.csv" dbms=CSV replace; run;
option notes;
* Save main data as .csv files by country;
%if &save_csv.=1 %then %do;
	%save_main_data_csv(out=Characteristics, data=scratch.world_data, path=&scratch_folder./output, end_date=&end_date.);
%end;
* Save daily return data as .csv files by country;
%if &save_daily_ret.=1 %then %do;
	%save_daily_ret_csv(out=Daily Returns, data=scratch.world_dsf, path=&scratch_folder./output, end_date=&end_date.);
%end;
* Save monthly return data as .csv files by country;
%if &save_monthly_ret.=1 %then %do;
	%save_monthly_ret_csv(out=world_ret_monthly, data=scratch.world_msf, path=&scratch_folder./output, end_date=&end_date.);
%end;
* Delete Temporary Files;
%if &delete_temp.=1 %then %do;
	proc delete data= scratch.market_returns_daily scratch.market_returns scratch.nyse_cutoffs scratch.return_cutoffs scratch.return_cutoffs_daily scratch.world_dsf scratch.world_msf scratch.world_data; run;
%end;

================================================ FILE: GlobalFactors/market_chars.sas ================================================

* Market Chars: Monthly;
%let monthly_chars=
	/* Market Based Size Variables */
	market_equity
	/* Total Dividend Paid to Market Equity */
	div1m_me div3m_me div6m_me div12m_me
/* Special Dividend Paid to Market Equity */ divspc1m_me divspc12m_me /* Change in Shares Outstanding */ chcsho_1m chcsho_3m chcsho_6m chcsho_12m /* Net Equity Payout */ eqnpo_1m eqnpo_3m eqnpo_6m eqnpo_12m /* Momentum/Reversal */ ret_1_0 ret_2_0 ret_3_0 ret_3_1 ret_6_0 ret_6_1 ret_9_0 ret_9_1 ret_12_0 ret_12_1 ret_12_7 ret_18_1 ret_24_1 ret_24_12 ret_36_1 ret_36_12 ret_48_1 ret_48_12 ret_60_1 ret_60_12 ret_60_36 /* Seasonality */ seas_1_1an seas_2_5an seas_6_10an seas_11_15an seas_16_20an seas_1_1na seas_2_5na seas_6_10na seas_11_15na seas_16_20na ; %put ### In total %nwords(&monthly_chars.) monthly characteristics will be created ###; %macro market_chars_monthly(out=, data=, market_ret=, local_currency=); %if &local_currency=1 %then %do; %let ret_var = ret_local; %end; %if &local_currency=0 %then %do; %let ret_var = ret; %end; /* Helper macro: Apply function lag0 to lag &n. */ %macro apply_to_lastn(x=, _n=, func=); %let mv = &func.(&x.; %do _i=1 %to &_n.-1; %let mv = &mv., lag&_i.(&x.); %end; %let mv = &mv.); &mv.; %mend apply_to_lastn; /* Get Important Variables */ proc sql; create table __monthly_chars1 as select a.id, a.date, a.eom, a.me, a.shares, a.adjfct, a.prc, a.ret, a.ret_local, a.&ret_var. as ret_x, a.div_tot, a.div_cash, a.div_spc, a.dolvol, a.ret_lag_dif, (a.ret_local = 0) as ret_zero, a.ret_exc, b.mkt_vw_exc /* Currently, Excess return is in USD because we lack RF for most markets */ from &data. as a left join &market_ret. 
as b on a.excntry=b.excntry and a.eom=b.eom order by a.id, a.eom; quit; * Ensure that there is a lag of 1 month between each obs; proc sql; create table __stock_coverage as select id, min(eom) as start_date, max(eom) as end_date from __monthly_chars1 group by id; quit; %expand(data=__stock_coverage, out=__full_range, id_vars=id, start_date=start_date, end_date=end_date, freq='month', new_date_name=eom); proc sql; create table __monthly_chars2 as select a.id, a.eom, missing(b.id) as obs_miss, b.me, b.shares, b.adjfct, b.prc, b.ret, b.ret_local, b.ret_x, b.ret_lag_dif, b.div_tot, b.div_cash, b.div_spc, b.dolvol, b.ret_zero, b.ret_exc, b.mkt_vw_exc from __full_range as a left join __monthly_chars1 as b on a.id=b.id and a.eom=b.eom order by id, eom; quit; * Cummulative Return Index; data __monthly_chars3; set __monthly_chars2; by id; retain ri_x; /* Local or USD depending on &local_currency.*/ retain ri; /* USD */ retain count; if first.id then do; ri_x = sum(1, ret_x); /* Most will have missing return for the first observation. 
In that case this evaluates to 1*/ ri = sum(1, ret); count = 1; end; else do; ri_x = ri_x*sum(1, ret_x); /* By using sum instead of 1+ret missing returns are set to 0 */ ri = ri*sum(1, ret); count = count+1; end; run; /* Set non-standard returns to missing */ data __monthly_chars4; set __monthly_chars3; ret_miss = missing(ret_x) or ret_lag_dif^=1; /* We set returns with more than one month between price observations to missing (only impact Compustat data) */ if ret_miss = 1 then do; ret_x = .; ret=.; ret_local=.; ret_exc =.; mkt_vw_exc = .; end; drop obs_miss ret_zero ret_lag_dif; run; /* Create variables */ proc sort nodup data=__monthly_chars4; by id eom; run; %macro temp(); data __monthly_chars5; set __monthly_chars4; by id eom; /* Market Equity */ market_equity = me; /* Dividend to Price */ %let div_range = 1 3 6 12; /* 24, 36*/ %do i=1 %to %sysfunc(countw(&div_range.)); %let n = %scan(&div_range., &i.); div_sum = %apply_to_lastn(x=div_tot*shares, _n=&n., func=sum); div&n.m_me = div_sum/me; if count < &n. then div&n.m_me = .; drop div_sum; %end; /* Special Dividends */ %let div_spc_range = 1 12; %do i=1 %to %sysfunc(countw(&div_spc_range.)); %let n = %scan(&div_spc_range., &i.); div_spc_sum = %apply_to_lastn(x=div_spc*shares, _n=&n., func=sum); divspc&n.m_me = div_spc_sum/me; if count < &n. then divspc&n.m_me = .; drop div_spc_sum; %end; /* Change in Shares Outstanding (Market Based Proxy for Net Share Issuance)*/ %let chcsho_lags = 1 3 6 12; %do i=1 %to %sysfunc(countw(&chcsho_lags.)); %let chcsho_lag = %scan(&chcsho_lags.,&i.); chcsho_&chcsho_lag.m = (shares*adjfct)/lag&chcsho_lag.(shares*adjfct)-1; if count <= &chcsho_lag. then chcsho_&chcsho_lag.m=.; %end; /* Net Equity Payout (Market based stock buyback+dividend-stock issuance)*/ %let eqnpo_lags = 1 3 6 12; %do i=1 %to %sysfunc(countw(&eqnpo_lags.)); %let eqnpo_lag = %scan(&eqnpo_lags.,&i.); eqnpo_&eqnpo_lag.m = log(ri/lag&eqnpo_lag.(ri))-log(me/lag&eqnpo_lag.(me)); if count <= &eqnpo_lag. 
then eqnpo_&eqnpo_lag.m=.; %end; /* Momentum/Reversal */ %let from_lags = 1 2 3 3 6 6 9 9 12 12 12 18 24 24 36 36 48 48 60 60 60; %let to_lags = 0 0 0 1 0 1 0 1 0 1 7 1 1 12 1 12 12 1 1 12 36; %do j=1 %to %sysfunc(countw(&from_lags.)); %let from = %scan(&from_lags., &j.); %let to = %scan(&to_lags., &j.); ret_&from._&to. = lag&to.(ri_x)/lag&from.(ri_x)-1; if count <= &from. or missing(lag&to.(ret_x)) then /* Require the last return observation to be non-missing */ ret_&from._&to.=.; %end; /* Seasonality: Heston and Sadka (2008) */ %seasonality(start_year=1, end_year=1); %seasonality(start_year=2, end_year=5); %seasonality(start_year=6, end_year=10); %seasonality(start_year=11, end_year=15); %seasonality(start_year=16, end_year=20); /* Drop Uneccesary Variables */ drop me shares adjfct shares adjfct prc ret ret_local ret_x div_tot div_cash div_spc dolvol ret_exc mkt_vw_exc ret_miss ri_x ri count; run; %mend; %temp(); proc sort data=__monthly_chars5 out=&out.; by id eom; run; proc delete data=__stock_coverage __full_range __monthly_chars1 __monthly_chars2 __monthly_chars3 __monthly_chars4 __monthly_chars5; run; %mend; /* Calculate CAPM beta over a rolling window */ %macro market_beta(out=, data=, fcts=, __n =, __min=); proc sql; create table __msf1 as select a.id, a.eom, a.ret_exc, a.ret_lag_dif, b.mktrf from &data. as a left join &fcts. as b on a.excntry=b.excntry and a.eom=b.eom where a.ret_local^=0 and not missing(a.ret_exc) and a.ret_lag_dif=1 and not missing(b.mktrf); quit; %winsorize_own(inset=__msf1, outset=__msf2, sortvar=eom, vars=ret_exc, perc_low=0.1, perc_high=99.9); /* Winsorize returns at 0.1% and 99.9% */ proc sort data=__msf2; by id eom; run; proc sql; create table month_ends as select distinct eom from __msf2 order by eom; quit; * Divide data into __n groups; proc sql; create table dates_apply as select *, mod(monotonic(), &__n.) as grp from month_ends; quit; * Helper macro: If first group, save &new. as &base. otherwise, append &new. 
to &base.; %macro save_or_append(base=, new=); %if &__grp. = 0 %then %do; data &base.; set &new.; run; %end; %else %do; proc append base=&base. data=&new.; run; %end; %mend; %do __grp=0 %to %eval(&__n. - 1); %put ############### GROUP %eval(&__grp.+1) out of &__n. ###############; * Prepare data; proc sql; create table calc_dates as select a.eom, b.eom as calc_date from dates_apply as a left join dates_apply(where=(grp = &__grp.)) as b on a.eom > intnx("month", b.eom, -&__n., "e") and a.eom <= b.eom; quit; proc sql; create table calc_data as select a.*, b.calc_date from __msf2 as a left join calc_dates as b on a.eom = b.eom where not missing(b.calc_date) group by a.id, b.calc_date having count(*) >= &__min. order by a.id, b.calc_date; quit; proc reg data=calc_data outest=__capm1 edf NOPRINT; by id calc_date; model ret_exc=mktrf; run; proc sql; create table __capm2 as select id, calc_date as eom, mktrf as beta_&__n.m, sqrt(_rmse_**2 * _edf_ / (_edf_ + 1)) as ivol_capm_&__n.m from __capm1 where (_edf_ + 2) >= &__min.; quit; %save_or_append(base=op_capm, new=__capm2); %end; proc sort data=op_capm out=&out. nodup; by id eom; run; proc delete data=op_capm; run; %mend; /* MACRO: RESIDUAL MOMENTUM -------------- - Rolling regressions over &__n. months. used to calculate residual momentum. - Currently I have only implemented FF3 but I could easily extend it to CAPM and HXZ4 - Residual momentum is typically calculated over a shorter horizon than that used to estimate parameters. The number of months to include return data from, is indicated by &incl. The number of months to skip within that period, is indicated by &skip. Both incl and skip can be list, but they need to be of the same length, */ %macro residual_momentum(out=, data=, fcts=, type=, __n =, __min=, incl=, skip=); /* first_lag and last_lag can be a list but must have equal length. 
Type in (market, ff3, hxz4) */ proc sql; create table __msf1 as select a.id, a.eom, a.ret_exc, a.ret_lag_dif, b.mktrf, b.hml, b.smb_ff, b.roe, b.inv, b.smb_hxz from &data. as a left join &fcts. as b on a.excntry=b.excntry and a.eom=b.eom where a.ret_local^=0 and not missing(a.ret_exc) and not missing(b.mktrf) and ret_lag_dif=1; quit; %winsorize_own(inset=__msf1, outset=__msf2, sortvar=eom, vars=ret_exc, perc_low=0.1, perc_high=99.9); /* Winsorize returns at 0.1% and 99.9% */ proc sort data=__msf2; by id eom; run; proc sql; create table month_ends as select distinct eom from __msf2 order by eom; quit; * Divide data into __n groups; proc sql; create table dates_apply as select *, mod(monotonic(), &__n.) as grp from month_ends; quit; * Helper macro: If first group, save &new. as &base. otherwise, append &new. to &base.; %macro save_or_append(base=, new=); %if &__grp. = 0 %then %do; data &base.; set &new.; run; %end; %else %do; proc append base=&base. data=&new.; run; %end; %mend; %do __grp=0 %to %eval(&__n. - 1); %put ############### GROUP %eval(&__grp.+1) out of &__n. ###############; * Prepare data; proc sql; create table calc_dates as select a.eom, b.eom as calc_date from dates_apply as a left join dates_apply(where=(grp = &__grp.)) as b on a.eom > intnx("month", b.eom, -&__n., "e") and a.eom <= b.eom; quit; proc sql; create table calc_data as select a.*, b.calc_date from __msf2 as a left join calc_dates as b on a.eom = b.eom where not missing(b.calc_date) group by a.id, b.calc_date having count(*) >= &__min. 
order by a.id, b.calc_date; quit; * Fama and French (1993) 3 factor model; %if %sysfunc(find(&type., ff3)) >= 1 %then %do; proc reg data=calc_data(where=(not missing(hml) and not missing(smb_ff))) NOPRINT; by id calc_date; model ret_exc=mktrf smb_ff hml; output out=__ff3_res1 residual=res; run; %do __i=1 %to %nwords(&incl.); %let __in = %scan(&incl., &__i., %str(' ')); %let __sk = %scan(&skip., &__i., %str(' ')); proc sql; create table __ff3_res2 as select *, (eom > intnx("month", calc_date, -&__in., "e") and eom <= intnx("month", calc_date, -&__sk., "e")) as incl /* Incl=1 --> obs is included in momentum calculation */ from __ff3_res1 group by id, calc_date having count(res) >= &__min. order by id, calc_date, eom; create table __ff3_res3 as select id, calc_date as eom, mean(res) / std(res) as resff3_&__in._&__sk. from __ff3_res2 where incl = 1 group by id, calc_date; quit; %save_or_append(base=op_&__in._&__sk., new=__ff3_res3); %end; %end; %end; /* Output */ %do __i=1 %to %nwords(&incl.); %let __in = %scan(&incl., &__i., %str(' ')); %let __sk = %scan(&skip., &__i., %str(' ')); proc sort data=op_&__in._&__sk. out=&out._&__in._&__sk. nodup; by id eom; run; proc delete data=op_&__in._&__sk.; run; %end; %mend; /* MACRO FOR DAILY CHARS ---------------------------------------------- */ %macro prepare_daily(data=, fcts=); /* Start timer */ %let __prep_start = %sysfunc(datetime()); /* Prepare stock level data */ proc sql; create table dsf1 as select a.excntry, a.id, a.date, a.eom, a.prc / a.adjfct as prc_adj, a.ret, a.ret_exc, a.dolvol as dolvol_d, a.shares, a.tvol, b.mktrf, b.hml, b.smb_ff, b.roe, b.inv, b.smb_hxz, a.ret_lag_dif, a.bidask, sum(a.ret_local = 0) as zero_obs /* Some firms have almost inclusively zero returns. These should be excluded */ from &data. as a left join &fcts. 
as b on a.excntry = b.excntry and a.date = b.date where not missing(b.mktrf) /* not missing mktrf ensures that we look at trading days*/ group by a.id, a.eom; update dsf1 set ret_exc = ., ret = . where ret_lag_dif > 14; /* Only used returns based on prices that are not more than two weeks old */ alter table dsf1 drop ret_lag_dif, bidask; quit; proc sort data=dsf1; by id date; run; * Create lead/lagged market returns (For dimson beta); proc sql; create table mkt_lead_lag1 as select excntry, date, intnx('month',date,0,'E') as eom format=YYMMDDN8., mktrf from &fcts. order by excntry, date desc; quit; data mkt_lead_lag2; set mkt_lead_lag1; mktrf_ld1 = lag(mktrf); if excntry ^= lag(excntry) or eom ^= lag(eom) then mktrf_ld1 = .; /* Eom condition is to avoid look-ahead bias */ run; proc sort data=mkt_lead_lag2 out=mkt_lead_lag3; by excntry date; run; data mkt_lead_lag4; set mkt_lead_lag3; mktrf_lg1 = lag(mktrf); if excntry ^= lag(excntry) then mktrf_lg1 = .; run; * Overlapping returns used to calculate correlation; data corr_data; set dsf1; ret_exc_3l = ret_exc + lag(ret_exc) + lag2(ret_exc); mkt_exc_3l = mktrf + lag(mktrf) + lag2(mktrf); if id ^= lag2(id) then do; ret_exc_3l = .; mkt_exc_3l = .; end; keep id eom zero_obs ret_exc_3l mkt_exc_3l; run; *Unique Month Ends; proc sql; create table month_ends as select distinct eom from dsf1 order by eom; quit; /* Stop timer */ data _null_; dur = datetime() - &__prep_start; put 30*'-' / ' PREPARING DAILY DATA TOOK:' dur time13.2 / 30*'-'; run; %mend; * MACRO: ROLL APPLY DAILY ------------------------- - Apply &__stats. functions to rolling windows of data. - The idea is to iteratively apply the functions to &__n. different splits of the data. The output of each function is a stock id-eom pair plus the calculated characteristics - The currently implemented &__stats. 
are: * rvol, rmax, skew, capm, capm_ext, ff3, hxz4, dimsonbeta, downbeta, zero_trades, turnover, dolvol, ami, prc_to_high, mktcorr, mktvol - Arguments: * OUT: Output dataset in a long format with all the requested characteristics * ...; %macro roll_apply_daily(out=, __n=, __min=, fcts=, __month_ends=, sfx =,__stats=); /* Create stats over rolling __n months. stats in (rvol, rmax, skew, capm, capm_ext, ff3, hxz4, dimsonbeta, downbeta, zero_trades, turnover, dolvol, ami, prc_to_high, mktcorr, mktvol) */ /* Start timer */ %let __roll_start = %sysfunc(datetime()); * Divide data into __n groups; proc sql; create table dates_apply as select *, mod(monotonic(), &__n.) as grp from &__month_ends.; quit; * Helper: If first group, save &new. as &base. otherwise, append &new. to &base.; %macro save_or_append(base=, new=); %if &__grp. = 0 %then %do; data &base.; set &new.; run; %end; %else %do; proc append base=&base. data=&new.; run; %end; %mend; * Drop unneccesary columns for faster join; data __input; set dsf1; run; %if %sysfunc(find(&__stats., ff3)) = 0 %then %do; proc sql; alter table __input drop hml, smb_ff; quit; %end; %if %sysfunc(find(&__stats., hxz4)) = 0 %then %do; proc sql; alter table __input drop roe, inv, smb_hxz; quit; %end; %if %sysfunc(find(&__stats., turnover)) = 0 and %sysfunc(find(&__stats., ami)) = 0 and %sysfunc(find(&__stats., zero_trades)) = 0 and %sysfunc(find(&__stats., dolvol)) = 0%then %do; proc sql; alter table __input drop dolvol_d, shares, tvol; quit; %end; %if %sysfunc(find(&__stats., prc_to_high)) = 0 %then %do; proc sql; alter table __input drop prc_adj; quit; %end; * Apply __stats to each group; %do __grp=0 %to %eval(&__n. 
- 1); * Prepare data; proc sql; create table calc_dates as select a.eom, b.eom as calc_date from dates_apply as a left join dates_apply(where=(grp = &__grp.)) as b on a.eom > intnx("month", b.eom, -&__n., "e") and a.eom <= b.eom; quit; * Not neccesary if mktcorr is the only stat; %if %nwords(&__stats.)>1 or %sysfunc(find(&__stats., mktcorr))=0 %then %do; proc sql; /* Used for volume variables */ create table calc_data_raw as select a.*, b.calc_date from __input as a left join calc_dates as b on a.eom = b.eom where not missing(b.calc_date) order by a.id, b.calc_date; /* Used for return variables */ create table calc_data_screen as select * from calc_data_raw where not missing(ret_exc) and zero_obs < 10 /* We exclude stock-months with 10 or more zero returns */ group by id, calc_date having count(ret_exc) >= &__min.; quit; %end; * Return Volatility; %if %sysfunc(find(&__stats., rvol)) >= 1 %then %do; proc sql; create table __rvol as select id, calc_date as eom, std(ret_exc) as rvol&sfx. from calc_data_screen group by id, calc_date having count(ret_exc) >= &__min.; quit; %save_or_append(base=op_rvol, new=__rvol); %end; * Maximum Return; %if %sysfunc(find(&__stats., rmax)) >= 1 %then %do; proc rank data= calc_data_screen out = __rmax1 descending; by id calc_date; var ret; ranks ret_rank; run; proc sql; create table __rmax2 as select id, calc_date as eom, mean(ret) as rmax5&sfx., max(ret) as rmax1&sfx. from __rmax1 where ret_rank<=5 group by id, calc_date; quit; %save_or_append(base=op_rmax, new=__rmax2); %end; * Return Skewness; %if %sysfunc(find(&__stats., skew)) >= 1 %then %do; proc means data=calc_data_screen skewness noprint; by id calc_date; var ret_exc; output out = __skew1 skewness = rskew&sfx.; run; proc sql; create table __skew2 as select id, calc_date as eom, rskew&sfx. 
from __skew1 where _freq_ >= &__min.; quit; %save_or_append(base=op_skew, new=__skew2); %end; * Price-to-high; %if %sysfunc(find(&__stats., prc_to_high)) >= 1 %then %do; proc sql; create table __prc_high as select id, calc_date as eom, prc_adj / max(prc_adj) as prc_highprc&sfx. from calc_data_screen group by id, calc_date having date = max(date) and count(prc_adj) >= &__min.; quit; %save_or_append(base=op_prc_high, new=__prc_high); %end; * Amihud (2002); %if %sysfunc(find(&__stats., ami)) >= 1 %then %do; proc sql; create table __ami as select id, calc_date as eom, mean(abs(ret) / dolvol_d) *1e6 as ami&sfx. from calc_data_screen group by id, calc_date having count(dolvol_d) >= &__min.; quit; %save_or_append(base=op_ami, new=__ami); %end; * CAPM regression (beta + ivol); %if %sysfunc(find(&__stats., capm)) >= 1 and %sysfunc(find(&__stats., capm_ext)) = 0 %then %do; proc reg data=calc_data_screen outest=__capm1 edf NOPRINT; by id calc_date; model ret_exc=mktrf; run; proc sql; create table __capm2 as select id, calc_date as eom, mktrf as beta&sfx., sqrt(_rmse_**2 * _edf_ / (_edf_ + 1)) as ivol_capm&sfx. from __capm1 where (_edf_ + 2) >= &__min.; quit; %save_or_append(base=op_capm, new=__capm2); %end; * CAPM regression extended (beta + ivol + iskew + coskewness); %if %sysfunc(find(&__stats., capm_ext)) >= 1 %then %do; proc reg data=calc_data_screen outest=__capm_ext1 edf NOPRINT; by id calc_date; model ret_exc=mktrf; output out=__capm_ext_res residual=res; /* Including the output statement increases the time by a factor of 3. It's neccesary to compute skewness */ run; proc sql; create table __capm_ext2 as select id, calc_date as eom, mktrf as beta&sfx., sqrt(_rmse_**2 * _edf_ / (_edf_ + 1)) as ivol_capm&sfx. 
from __capm_ext1 where (_edf_ + 2) >= &__min.; quit; * Idiosyncratic skewness; proc means data=__capm_ext_res skewness noprint; by id calc_date; var res; output out = __capm_ext_skew(where=(_freq_ >= &__min.)) skewness = iskew_capm&sfx.; run; * Coskewness; proc sql; create table __capm_ext_coskew1 as select id, calc_date, res, mktrf - mean(mktrf) as mktrf_dm from __capm_ext_res group by id, calc_date; create table __capm_ext_coskew2 as select id, calc_date, mean(res * mktrf_dm**2) / (sqrt(mean(res**2)) * mean(mktrf_dm**2) ) as coskew&sfx. from __capm_ext_coskew1 group by id, calc_date having count(res) >= &__min.; quit; proc sql; create table __capm_ext3 as select a.*, b.iskew_capm&sfx., c.coskew&sfx. from __capm_ext2 as a left join __capm_ext_skew as b on a.id=b.id and a.eom=b.calc_date left join __capm_ext_coskew2 as c on a.id=c.id and a.eom=c.calc_date; quit; %save_or_append(base=op_capm_ext, new=__capm_ext3); %end; * Fama and French (1993) 3 factor model; %if %sysfunc(find(&__stats., ff3)) >= 1 %then %do; proc reg data=calc_data_screen(where=(not missing(hml) and not missing(smb_ff))) outest=__ff31 edf NOPRINT; by id calc_date; model ret_exc=mktrf smb_ff hml; output out=__ff3_res residual=res; /* Including the output statement increases the time by a factor of 3. It's neccesary to compute skewness */ run; proc sql; create table __ff32 as select id, calc_date as eom, sqrt(_rmse_**2 * _edf_ / (_edf_ + 1)) as ivol_ff3&sfx. from __ff31 where (_edf_ + 4) >= &__min.; quit; * Idiosyncratic skewness; proc means data=__ff3_res skewness noprint; by id calc_date; var res; output out = __ff3_skew(where=(_freq_ >= &__min.)) skewness = iskew_ff3&sfx.; run; proc sql; create table __ff33 as select a.*, b.iskew_ff3&sfx. 
from __ff32 as a left join __ff3_skew as b on a.id=b.id and a.eom=b.calc_date; quit; %save_or_append(base=op_ff3, new=__ff33); %end; * Hou, Xue and Zhang (2015) 4 factor model; %if %sysfunc(find(&__stats., hxz4)) >= 1 %then %do; proc reg data=calc_data_screen(where=(not missing(roe) and not missing(inv) and not missing(smb_hxz))) outest=__hxz41 edf NOPRINT; by id calc_date; model ret_exc=mktrf smb_hxz roe inv; output out=__hxz4_res residual=res; /* Including the output statement increases the time by a factor of 3. It's neccesary to compute skewness */ run; proc sql; create table __hxz42 as select id, calc_date as eom, sqrt(_rmse_**2 * _edf_ / (_edf_ + 1)) as ivol_hxz4&sfx. from __hxz41 where (_edf_ + 5) >= &__min.; quit; * Idiosyncratic skewness; proc means data=__hxz4_res skewness noprint; by id calc_date; var res; output out = __hxz4_skew(where=(_freq_ >= &__min.)) skewness = iskew_hxz4&sfx.; run; proc sql; create table __hxz43 as select a.*, b.iskew_hxz4&sfx. from __hxz42 as a left join __hxz4_skew as b on a.id=b.id and a.eom=b.calc_date; quit; %save_or_append(base=op_hxz4, new=__hxz43); %end; * Dimson beta; %if %sysfunc(find(&__stats., dimsonbeta)) >= 1 %then %do; proc sql; create table __dimson1 as select a.excntry, a.id, a.date, a.eom, a.ret_exc, a.mktrf, b.mktrf_lg1, b.mktrf_ld1 from calc_data_screen as a left join mkt_lead_lag4 as b on a.excntry = b.excntry and a.date = b.date where not missing(b.mktrf_lg1) and not missing(b.mktrf_ld1); create table __dimson2 as select * from __dimson1 group by id, eom having count(*) >= (&__min. - 1); /* Minus one to reflect the fact that there is one less available observation due to the need of avoiding lookahead bias. */ quit; proc reg data=__dimson2 outest=__dimson3 edf NOPRINT; by id eom; model ret_exc=mktrf mktrf_lg1 mktrf_ld1; run; data __dimson4; set __dimson3; beta_dimson&sfx. 
= mktrf + mktrf_lg1 + mktrf_ld1; keep id eom beta_dimson&sfx.; run; %save_or_append(base=op_dimson, new=__dimson4); %end; * Downside beta; %if %sysfunc(find(&__stats., downbeta)) >= 1 %then %do; proc reg data=calc_data_screen(where=(mktrf < 0)) outest=__downbeta1 edf NOPRINT; by id calc_date; model ret_exc=mktrf; run; proc sql; create table __downbeta2 as select id, calc_date as eom, mktrf as betadown&sfx. from __downbeta1 where (_edf_ + 2) >= (&__min. / 2); /* Use convention that we require half as many obs for downside beta */ quit; %save_or_append(base=op_downbeta, new=__downbeta2); %end; * Number of zero trades with turnover as tiebreaker; %if %sysfunc(find(&__stats., zero_trades)) >= 1 %then %do; proc sql; create table __zero_trades1 as select id, calc_date as eom, mean(tvol=0) * 21 as zero_trades, mean(tvol / (shares * 1e6)) as turnover from calc_data_raw where not missing(tvol) group by id, calc_date having count(tvol) >= &__min. order by eom; quit; proc rank data=__zero_trades1(where=(not missing(zero_trades) and not missing(turnover))) out = __zero_trades2 descending ties=mean f; /* f means that we use fractional ranks i.e. between 0 and 1*/ by eom; var turnover; ranks rank_turnover; run; proc sql; create table __zero_trades3 as select id, eom, zero_trades + rank_turnover / 100 as zero_trades&sfx. /* Divide by 100 to ensure that turnover only acts as a tie breaker (1/365*21=0.0833)*/ from __zero_trades2; quit; %save_or_append(base=op_zero_trades, new=__zero_trades3); %end; * Turnover; %if %sysfunc(find(&__stats., turnover)) >= 1 %then %do; proc sql; create table __turnover1 as select id, date, calc_date, tvol / (shares * 1e6) as turnover_d from calc_data_raw; create table __turnover2 as select id, calc_date as eom, mean(turnover_d) as turnover&sfx., std(turnover_d) / (calculated turnover&sfx.) as turnover_var&sfx. 
from __turnover1 group by id, calc_date having count(turnover_d) >= &__min.; quit; %save_or_append(base=op_turnover, new=__turnover2); %end; * Dollar Volume; %if %sysfunc(find(&__stats., dolvol)) >= 1 %then %do; proc sql; create table __dolvol as select id, calc_date as eom, mean(dolvol_d) as dolvol&sfx., std(dolvol_d) / (calculated dolvol&sfx.) as dolvol_var&sfx. from calc_data_raw group by id, calc_date having count(dolvol_d) >= &__min.; quit; %save_or_append(base=op_dolvol, new=__dolvol); %end; * Correlation to Market; %if %sysfunc(find(&__stats., mktcorr)) >= 1 %then %do; proc sql; create table __corr_data1 as select a.*, b.calc_date from corr_data as a left join calc_dates as b on a.eom = b.eom where not missing(b.calc_date) and not missing(ret_exc_3l) and zero_obs < 10 order by a.id, b.calc_date; create table __corr_data2 as select * from __corr_data1 group by id, calc_date having count(ret_exc_3l) >= &__min. and count(mkt_exc_3l) >= &__min.; quit; proc corr data = __corr_data2 outp=__corr1 noprint nomiss ; by id calc_date; var ret_exc_3l mkt_exc_3l; run; proc sql; create table __corr2 as select id, calc_date as eom, ret_exc_3l as corr&sfx. from __corr1 where _type_='CORR' and _name_ = 'mkt_exc_3l'; quit; %save_or_append(base=op_corr, new=__corr2); %end; * Market Volatility (separately for each stock); %if %sysfunc(find(&__stats., mktvol)) >= 1 %then %do; proc sql; create table __mktvol as select id, calc_date as eom, std(mktrf) as __mktvol&sfx. 
from calc_data_screen
    group by id, calc_date
    having count(ret_exc) >= &__min.;
quit;
%save_or_append(base=op_mktvol, new=__mktvol);
%end;
* NAME;
%if %sysfunc(find(&__stats., NAME)) >= 1 %then %do;
    %save_or_append(base=, new=);
%end;
/* Closes the enclosing %do of the roll-apply macro (its %macro statement is earlier in this file) */
%end;
* Make all observations into a dataset by transposing and appending;
/* Collect the names of all WORK datasets created above (prefix "op_") into one macro-variable list */
proc sql noprint;
    select memname into :op_datasets separated by " "
    from dictionary.tables
    where lowcase(libname)="work" and prxmatch("/^op\_/i", memname) > 0;
quit;
* Initialize dataset to append to;
/* Empty shell with the target schema (id, eom, stat, value); stop; prevents any rows from being output */
data &out.;
    format id 9.0 eom YYMMDDN8. stat $char20. value 16.8;
    stop;
run;
/* For each op_* dataset: sort, transpose to long (id, eom, stat, value), append to &out., then delete */
%do k=1 %to %nwords(&op_datasets.);
    %let __dt = %scan(&op_datasets., &k., %str(' '));
    proc sort data=&__dt.; by id eom; run;
    proc transpose data=&__dt. out=__op(rename=(col1=value)) name=stat ;
        by id eom;
    run;
    proc append base=&out. data=__op force; run;
    proc delete data=&__dt.; run;
%end;
/* Stop timer */
data _null_;
    dur = datetime() - &__roll_start;
    put 30*'-' / ' DAILY ROLL APPLY TOOK:' dur time13.2 / 30*'-';
run;
%mend;
/* MACRO: FINISH DAILY CHARS*/
/* Combine the rolling-window characteristic datasets (21d/126d/252d/1260d, long format in
   scratch.roll_*) with the Corwin-Schultz bid-ask estimates into one wide dataset of daily
   characteristics per (id, eom), adds two derived characteristics, and saves to &out. */
%macro finish_daily_chars(out=);
    * Make bidask into a long format;
    proc transpose data=scratch.corwin_schultz out=bidask(rename=(col1=value)) name=stat;
        by id eom;
    run;
    * Combine all roll chars;
    data daily_chars1; set scratch.roll_21d; run;
    proc append base=daily_chars1 data=scratch.roll_126d; run;
    proc append base=daily_chars1 data=scratch.roll_252d; run;
    proc append base=daily_chars1 data=scratch.roll_1260d; run;
    proc append base=daily_chars1 data=bidask force; run;
    proc sort data=daily_chars1 nodup; by id eom; run;
    /* Long -> wide: one column per stat name */
    proc transpose data = daily_chars1 out= daily_chars2(drop=_name_);
        by id eom;
        id stat;
        var value;
    run;
    /* Derived characteristics: betabab_1260d combines the 1260d market correlation with the
       stock-to-market volatility ratio (betting-against-beta style beta — presumably; built
       from corr_1260d * rvol_252d / __mktvol_252d), and rmax5_rvol_21d scales the max-return
       measure by volatility. __mktvol_252d is only an input and is dropped afterwards. */
    proc sql;
        create table daily_chars3 as
        select *, corr_1260d * rvol_252d/__mktvol_252d as betabab_1260d,
               rmax5_21d / rvol_252d as rmax5_rvol_21d
        from daily_chars2;
        alter table daily_chars3 drop __mktvol_252d;
    quit;
    proc sort data=daily_chars3 out=&out.; by id eom; run;
%mend;
================================================
FILE: GlobalFactors/portfolios.R ================================================ library(lubridate) library(tidyverse) library(data.table) # How To -------------------- # Paths # - data_path: Set to path with global characteristics data and if daily_pf==T should also contain a folder with daily stock returns. # - output_path: Set to desired output folder. # - legacy_path: Set to folder if you want to maintain legacy version. If not, set to NULL. # Countries # - countries: Choose the countries where portfolio returns are created. Default: All countries in data_path/Characteristics # Characteristics # - chars: Characteristics to create portfolios from. Can be any column from the global characteristics dataset. # Portfolio Settings # - settings: Choose how to create portfolios. For more information, see description for the portfolios() function # User Input ----------------------- # Paths data_path <- "../../Data" output_path <- "../../PaperFactors" legacy_path <- "../../Legacy" # Countries countries <- list.files(path = paste0(data_path, "/Characteristics")) %>% str_remove(".csv") # Chars chars <- c( "age", "aliq_at", "aliq_mat", "ami_126d", "at_be", "at_gr1", "at_me", "at_turnover", "be_gr1a", "be_me", "beta_60m", "beta_dimson_21d", "betabab_1260d", "betadown_252d", "bev_mev", "bidaskhl_21d", "capex_abn", "capx_gr1", "capx_gr2", "capx_gr3", "cash_at", "chcsho_12m", "coa_gr1a", "col_gr1a", "cop_at", "cop_atl1", "corr_1260d", "coskew_21d", "cowc_gr1a", "dbnetis_at", "debt_gr3", "debt_me", "dgp_dsale", "div12m_me", "dolvol_126d", "dolvol_var_126d", "dsale_dinv", "dsale_drec", "dsale_dsga", "earnings_variability", "ebit_bev", "ebit_sale", "ebitda_mev", "emp_gr1", "eq_dur", "eqnetis_at", "eqnpo_12m", "eqnpo_me", "eqpo_me", "f_score", "fcf_me", "fnl_gr1a", "gp_at", "gp_atl1", "ival_me", "inv_gr1", "inv_gr1a", "iskew_capm_21d", "iskew_ff3_21d", "iskew_hxz4_21d", "ivol_capm_21d", "ivol_capm_252d", "ivol_ff3_21d", "ivol_hxz4_21d", "kz_index", "lnoa_gr1a", "lti_gr1a", 
"market_equity", "mispricing_mgmt", "mispricing_perf", "ncoa_gr1a", "ncol_gr1a", "netdebt_me", "netis_at", "nfna_gr1a", "ni_ar1", "ni_be", "ni_inc8q", "ni_ivol", "ni_me", "niq_at", "niq_at_chg1", "niq_be", "niq_be_chg1", "niq_su", "nncoa_gr1a", "noa_at", "noa_gr1a", "o_score", "oaccruals_at", "oaccruals_ni", "ocf_at", "ocf_at_chg1", "ocf_me", "ocfq_saleq_std", "op_at", "op_atl1", "ope_be", "ope_bel1", "opex_at", "pi_nix", "ppeinv_gr1a", "prc", "prc_highprc_252d", "qmj", "qmj_growth", "qmj_prof", "qmj_safety", "rd_me", "rd_sale", "rd5_at", "resff3_12_1", "resff3_6_1", "ret_1_0", "ret_12_1", "ret_12_7", "ret_3_1", "ret_6_1", "ret_60_12", "ret_9_1", "rmax1_21d", "rmax5_21d", "rmax5_rvol_21d", "rskew_21d", "rvol_21d", "sale_bev", "sale_emp_gr1", "sale_gr1", "sale_gr3", "sale_me", "saleq_gr1", "saleq_su", "seas_1_1an", "seas_1_1na", "seas_11_15an", "seas_11_15na", "seas_16_20an", "seas_16_20na", "seas_2_5an", "seas_2_5na", "seas_6_10an", "seas_6_10na", "sti_gr1a", "taccruals_at", "taccruals_ni", "tangibility", "tax_gr1a", "turnover_126d", "turnover_var_126d", "z_score", "zero_trades_126d", "zero_trades_21d", "zero_trades_252d" ) # Portfolio settings settings <- list( end_date = as.Date("2023-12-31"), pfs = 3, source = c("CRSP", "COMPUSTAT"), wins_ret = T, bps = "non_mc", bp_min_n = 10, cmp = list( us = T, int = F ), signals = list( us = F, int = F, standardize = T, weight = "vw_cap" ), regional_pfs = list( ret_type = "vw_cap", # Type of return to use for regional factors country_excl = c("ZWE", "VEN"), # Countries are excluded due to data issues country_weights = "market_cap", # How to weight countries? 
In ("market_cap", "stocks", "ew") stocks_min = 5, # Minimum amount of stocks in each side of the portfolios months_min = 5 * 12, # Minimum amount of observations a factor needs to be included countries_min = 3 # Minimum number of countries necessary in a regional portfolio ), daily_pf = T, ind_pf = T ) # Portfolio Function ------------- portfolios <- function( data_path, excntry, chars, source = c("CRSP", "COMPUSTAT"), # Use data from "CRSP", "Compustat" or both: c("CRSP", "COMPUSTAT") wins_ret = T, # Should Compustat returns be winsorized at the 0.1% and 99.9% of CRSP returns? pfs, # Number of portfolios bps, # What should breakpoints be based on? Non-Microcap stocks ("non_mc") or NYSE stocks "nyse" bp_min_n, # Minimum number of stocks used for breakpoints cmp = F, # Create characteristics managed size portfolios? signals = F, # Create portfolio signals? signals_standardize = F, # Map chars to [-0.5, +0.5]?, signals_w = "vw_cap", # Weighting for signals: in c("ew", "vw", "vw_cap") nyse_size_cutoffs, # Data frame with NYSE size breakpoints daily_pf= F, # Should daily return be estimated ind_pf = F, # Should industry portfolio returns be estimated ret_cutoffs = NULL, # Data frame for monthly winsorization. Neccesary when wins_ret=T ret_cutoffs_daily = NULL # Data frame for daily winsorization. 
Neccesary when wins_ret=T and daily_pf=T ) { # Characteristic Data data <- fread(paste0(data_path, "/Characteristics/", excntry, ".csv"), select = c("id", "eom", "source_crsp", "comp_exchg", "crsp_exchcd", "size_grp", "ret_exc", "ret_exc_lead1m", "me", "gics", "ff49", chars), colClasses = c("eom"="character")) data[, eom := eom %>% lubridate::fast_strptime(format = "%Y%m%d") %>% as.Date()] # ME CAP data <- nyse_size_cutoffs[, .(eom, nyse_p80)][data, on = "eom"] data[, me_cap := pmin(me, nyse_p80)][, nyse_p80 := NULL] # Screens if (length(source) == 1) { if (source == "CRSP") { data <- data[source_crsp == 1] } if (source == "COMPUSTAT") { data <- data[source_crsp == 0] } } data <- data[!is.na(size_grp) & !is.na(me) & !is.na(ret_exc_lead1m)] # The ret_exc_lead1m screen assumes that investor knew at the beginning of the month that the security would delist. # Daily Returns if (daily_pf) { daily <- fread(paste0(data_path, "/Daily Returns/", excntry, ".csv"), colClasses = c("date"="character"), select = c("id", "date", "ret_exc")); gc() daily[, date := date %>% lubridate::fast_strptime(format = "%Y%m%d") %>% as.Date()] daily[, eom_lag1 := floor_date(date, unit="month")-1] } # Winsorize Returns? 
if (wins_ret) {
  # Join CRSP-based monthly return cutoffs (0.1% / 99.9% percentiles) onto the data by eom.
  # ret_cutoffs carries eom_lag1 because the screen applies to the leaded return ret_exc_lead1m.
  data <- ret_cutoffs[, .("eom" = eom_lag1, "p001"=ret_exc_0_1, "p999"=ret_exc_99_9)][data, on = "eom"]
  # Winsorize only non-CRSP (Compustat, source_crsp == 0) observations at the CRSP tails
  data[source_crsp == 0 & ret_exc_lead1m > p999, ret_exc_lead1m := p999]
  data[source_crsp == 0 & ret_exc_lead1m < p001, ret_exc_lead1m := p001]
  data[, c("source_crsp", "p001", "p999") := NULL]
  if (daily_pf) {
    # Daily cutoffs are keyed by (year, month) rather than eom
    daily[, year := year(date)]
    daily[, month := month(date)]
    daily <- ret_cutoffs_daily[, .(year, month, "p001"=ret_exc_0_1, "p999"=ret_exc_99_9)][daily, on = .(year, month)]
    daily[id>99999 & ret_exc > p999, ret_exc := p999] # Only winsorize Compustat data, id for CRSP is 5 digits, Compustat is 9: source_crsp == 0
    daily[id>99999 & ret_exc < p001, ret_exc := p001]
    daily[, c("p001", "p999", "year", "month") := NULL]
  }
}
# Standardize to [-0.5, +0.5] interval (for signals)
if (signals_standardize & signals) {
  # Cross-sectional rank within each month (NAs kept as NA), then rescale so the
  # largest rank maps to +0.5 (and low values toward -0.5)
  data[, (chars) := lapply(.SD, function(x) frank(x, ties.method = "min", na.last = "keep")), .SDcols = chars, by = eom]
  data[, (chars) := lapply(.SD, as.numeric), .SDcols = chars]
  data[, (chars) := lapply(.SD, function(x) x / max(x, na.rm=T) - 0.5), .SDcols = chars, by = eom]
}
# Industry Portfolios
if (ind_pf) {
  # NOTE(review): excntry here is the function's scalar country argument, recycled
  # into a constant column (it is not among the columns read by fread above)
  ind_data <- data[!is.na(gics), .(eom, gics, excntry, ret_exc_lead1m, me, me_cap)]
  # Get first 2 digits of GICS code for industry groups
  ind_data[, gics := as.numeric(substr(ind_data$gics, 1, 2))]
  # Equal-, value- and capped-value-weighted industry returns per GICS group and month
  ind_gics <- ind_data[, .(
    n = .N,
    ret_ew = mean(ret_exc_lead1m),
    ret_vw = sum(ret_exc_lead1m * me) / sum(me),
    ret_vw_cap = sum(ret_exc_lead1m * me_cap) / sum(me_cap)
  ), by = .(gics, eom)][, excntry := str_to_upper(excntry)]
  # Lead month to match using leaded returns
  ind_gics[, eom := ceiling_date(eom+1, unit = "month")-1]
  ind_gics <- ind_gics[n >= bp_min_n]
  # Estimate industry portfolios by Fama-French portfolios for US data
  if (excntry == "usa"){
    ind_data <- data[!is.na(ff49), .(eom, ff49, ret_exc_lead1m, me, me_cap)]
    ind_ff49 <- ind_data[, .(
      n = .N,
      ret_ew = mean(ret_exc_lead1m),
      ret_vw = sum(ret_exc_lead1m * me) / sum(me),
      ret_vw_cap =
sum(ret_exc_lead1m * me_cap) / sum(me_cap) ), by = .(ff49, eom)][, excntry := str_to_upper(excntry)] ind_ff49[, eom := ceiling_date(eom+1, unit = "month")-1] ind_ff49 <- ind_ff49[n >= bp_min_n] } } # Prepare output list output <- list() # Apply Portfolio Function to Each Characteristic char_pfs <- chars %>% lapply(function(x) { op <- list() print(paste0(" " , x, ": ", match(x, chars), " out of ", length(chars))) data[, var := as.double(get(x))] # Unless we need to compute signals, limit size of data if(!signals) { sub <- data[!is.na(var), .(id, eom, var, size_grp, ret_exc_lead1m, me, me_cap, crsp_exchcd, comp_exchg)] } else { sub <- data[!is.na(var)] } # Portfolio Assignment if (bps == "nyse") { sub[, bp_stock := (crsp_exchcd == 1 & is.na(comp_exchg)) | (comp_exchg == 11 & is.na(crsp_exchcd))] } if (bps == "non_mc") { sub[, bp_stock := (size_grp %in% c("mega", "large", "small"))] } sub[, bp_n := sum(bp_stock), by = eom] sub <- sub[bp_n >= bp_min_n] # require at least 10 stocks for break points if (nrow(sub) != 0) { sub[, cdf := ecdf(var[bp_stock == T])(var), by = eom] sub[, min_cdf := min(cdf), by = eom] sub[cdf == min_cdf, cdf := 0.00000001] # To ensure that the lowest value is in portfolio 1 sub[, pf := ceiling(cdf*pfs), by = eom] sub[pf == 0, pf := 1] # Happens when non-bp stocks extend beyond bp stock range # Returns op$pf_returns <- sub[, .( characteristic = x, n = .N, signal = median(var), ret_ew = mean(ret_exc_lead1m), ret_vw = sum(ret_exc_lead1m * me) / sum(me), ret_vw_cap = sum(ret_exc_lead1m * me_cap) / sum(me_cap) ), by = .(pf, eom)] op$pf_returns[, eom := ceiling_date(eom+1, unit = "month")-1] # Reflect the fact that returns are leaded # Signals if (signals) { if (signals_w == "ew") { sub[, w := 1/.N, by = .(pf, eom)] } if (signals_w == "vw") { sub[, w := me / sum(me), .(pf, eom)] } if (signals_w == "vw_cap") { sub[, w := me_cap / sum(me_cap), .(pf, eom)] } sub[, (chars) := lapply(.SD, function(x) if_else(is.na(x), 0, x)), .SDcols = chars] # Set missing 
to median of 0 pf_signals <- sub[, lapply(.SD, function(x) sum(w * x)), .SDcols = chars, by = .(pf, eom)] pf_signals[, characteristic := x] pf_signals[, eom := ceiling_date(eom+1, unit = "month")-1] # Reflect the fact that returns are leaded op$signals <- pf_signals } # Daily Portfolios if (daily_pf) { # Keep weights constant throughout month weights <- sub[, .(id, w_ew = 1/.N, w_vw = me/sum(me), w_vw_cap = me_cap/sum(me_cap)), by = .(eom, pf)] daily_sub <- weights[daily, on = .(id, eom=eom_lag1)][!is.na(pf) & !is.na(ret_exc)] op$pf_daily <- daily_sub[, .( n = .N, ret_ew = sum(w_ew*ret_exc), ret_vw = sum(w_vw*ret_exc), ret_vw_cap = sum(w_vw_cap*ret_exc) ), by = .(pf, date)][, characteristic := x] } # Output return(op) } }) output$pf_returns <- char_pfs %>% lapply(function(x) x$pf_returns) %>% rbindlist() if (daily_pf) { output$pf_daily <- char_pfs %>% lapply(function(x) x$pf_daily) %>% rbindlist() } if (ind_pf) { output$gics_returns <- ind_gics if (excntry == "usa") { output$ff49_returns <- copy(ind_ff49) } } if (nrow(output$pf_returns) != 0) { output$pf_returns[, excntry := str_to_upper(excntry)] if (daily_pf) { output$pf_daily[, excntry := str_to_upper(unique(output$pf_returns[, excntry]))] } if (signals) { output$signals <- char_pfs %>% lapply(function(x) x$signals) %>% rbindlist() output$signals[, excntry := str_to_upper(excntry)] } } # Characteristic Managed Portfolios if (cmp) { output$cmp <- chars %>% lapply(function(x) { print(paste0(" CMP - " , x, ": ", match(x, chars), " out of ", length(chars))) data[, var := get(x)] sub <- data[!is.na(var), .(eom, var, size_grp, ret_exc_lead1m)] sub[, p_rank := frank(var, na.last=NA, ties.method = "average") / (.N + 1), by = .(size_grp, eom)] # Notice tie method sub[, p_rank_dev := p_rank - mean(p_rank), by = .(size_grp, eom)] sub[, weight := p_rank_dev / (sum(abs(p_rank_dev)) / 2), by = .(size_grp, eom)] # 1 unit invested in each of the long and short pf cmp <- sub[, .( excntry = excntry, characteristic = x, n_stocks = 
.N, ret_weighted = sum(ret_exc_lead1m * weight), signal_weighted = sum(var * weight), sd_var = sd(var) ), by = .(size_grp, eom)] cmp <- cmp[sd_var != 0][, sd_var := NULL] cmp[, eom := ceiling_date(eom+1, unit = "month")-1] # Reflect the fact that returns are leaded return(cmp) }) %>% rbindlist() output$cmp[, excntry := str_to_upper(excntry)] } # Output return(output) } # Extract Neccesary Information -------------------- # Factor Details char_info <- readxl::read_xlsx("Factor Details.xlsx", sheet = "details", range = "A1:N300") %>% select("characteristic"=abr_jkp, direction) %>% filter(!is.na(characteristic)) %>% mutate(direction = direction %>% as.integer) %>% setDT() # Country Classification country_classification <- readxl::read_xlsx("Country Classification.xlsx", sheet = "countries", range = "A1:I200") %>% select(excntry, msci_development, region) %>% filter(!is.na(excntry) & !(excntry %in% settings$regional_pfs$country_excl)) %>% setDT() regions <- tibble( name = c("developed", "emerging", "frontier", "world", "world_ex_us"), country_codes = list( country_classification[msci_development == "developed" & excntry != "USA"]$excntry, country_classification[msci_development == "emerging"]$excntry, country_classification[msci_development == "frontier"]$excntry, country_classification$excntry, country_classification[excntry != "USA"]$excntry ), countries_min = c(rep(settings$regional_pfs$countries_min, 3), 1, 3) ) # Cluster Labels cluster_labels <- fread("Cluster Labels.csv") # NYSE Cutoff nyse_size_cutoffs <- fread(paste0(data_path, "/nyse_cutoffs.csv"), colClasses = c("eom"="character")) nyse_size_cutoffs[, eom := as.Date(eom, format = "%Y%m%d")] # CRSP Return Cutoffs ret_cutoffs <- fread(paste0(data_path, "/return_cutoffs.csv"), colClasses = c("eom"="character")) ret_cutoffs[, eom := as.Date(eom, format = "%Y%m%d")] ret_cutoffs[, eom_lag1 := floor_date(eom, unit = "month") - 1] # Because we use ret_exc_lead1m if (settings$daily_pf) { ret_cutoffs_daily <- 
fread(paste0(data_path, "/return_cutoffs_daily.csv")) } # Market market <- fread(paste0(data_path, "/market_returns.csv"), colClasses = c("eom"="character")) market[, eom := eom %>% as.Date("%Y%m%d")] if (settings$daily_pf) { market_daily <- fread(paste0(data_path, "/market_returns_daily.csv"), colClasses = c("date"="character")) market_daily[, date := date %>% as.Date("%Y%m%d")] } # Create Portfolios ----------------------- portfolio_data <- countries %>% lapply(function(ex) { print(paste0(ex, ": ", match(ex, countries), " out of ", length(countries))) portfolios( data_path = data_path, excntry = ex, chars = chars, source = settings$source, wins_ret = settings$wins_ret, pfs=settings$pfs, bps=settings$bps, bp_min_n=settings$bp_min_n, cmp = if_else(ex == "usa", settings$cmp$us, settings$cmp$int), signals=if_else(ex == "usa", settings$signals$us, settings$signals$int), signals_standardize=settings$signals$standardize, signals_w=settings$signals$weight, nyse_size_cutoffs = nyse_size_cutoffs, daily_pf = settings$daily_pf, ind_pf = settings$ind_pf, ret_cutoffs = ret_cutoffs, ret_cutoffs_daily = ret_cutoffs_daily ) }) # Daily Data if (settings$daily_pf) { # Daily Portfolio Returns pf_daily <- portfolio_data %>% lapply(function(x) x$pf_daily) %>% rbindlist() pf_daily %>% setorder(excntry, characteristic, pf, date) # Daily Long-Short Factors hml_daily <- pf_daily[, .( pfs = sum(pf == settings$pfs) + sum(pf == 1), n_stocks = n[pf==settings$pfs] + n[pf==1], n_stocks_min = as.integer(min(n[pf==settings$pfs], n[pf==1])), ret_ew = ret_ew[pf==settings$pfs] - ret_ew[pf==1], ret_vw = ret_vw[pf==settings$pfs] - ret_vw[pf==1], ret_vw_cap = ret_vw_cap[pf==settings$pfs] - ret_vw_cap[pf==1] ), .(excntry, characteristic, date)] hml_daily <- hml_daily[pfs == 2][, pfs := NULL] hml_daily %>% setorder(excntry, characteristic, date) lms_daily <- char_info[hml_daily, on = "characteristic"] resign_cols <- c("ret_ew", "ret_vw", "ret_vw_cap") lms_daily[, (resign_cols) := lapply(.SD, function(x) 
x*direction), .SDcols=resign_cols] } # Monthly Portfolio Returns pf_returns <- portfolio_data %>% lapply(function(x) x$pf_returns) %>% rbindlist() pf_returns <- pf_returns %>% select(excntry, characteristic, pf, eom, n, signal, ret_ew, ret_vw, ret_vw_cap) pf_returns %>% setorder(excntry, characteristic, pf, eom) # GICS Returns if (settings$ind_pf) { gics_returns <- portfolio_data %>% lapply(function(x) x$gics_returns) %>% rbindlist() gics_returns %>% setorder(excntry, gics, eom) ff49_returns <- portfolio_data[[which(countries == "usa")]]$ff49_returns ff49_returns %>% setorder(excntry, ff49, eom) } # Create HML Returns hml_returns <- pf_returns[, .( pfs = sum(pf == settings$pfs) + sum(pf == 1), signal = signal[pf==settings$pfs] - signal[pf==1], n_stocks = n[pf==settings$pfs] + n[pf==1], n_stocks_min = min(n[pf==settings$pfs], n[pf==1]), ret_ew = ret_ew[pf==settings$pfs] - ret_ew[pf==1], ret_vw = ret_vw[pf==settings$pfs] - ret_vw[pf==1], ret_vw_cap = ret_vw_cap[pf==settings$pfs] - ret_vw_cap[pf==1] ), .(excntry, characteristic, eom)] hml_returns <- hml_returns[pfs == 2][, pfs := NULL] hml_returns %>% setorder(excntry, characteristic, eom) # Create Long-Short Factors [Sign Returns to be consistent with original paper] lms_returns <- char_info[hml_returns, on = "characteristic"] resign_cols <- c("signal", "ret_ew", "ret_vw", "ret_vw_cap") lms_returns[, (resign_cols) := lapply(.SD, function(x) x*direction), .SDcols=resign_cols] # Extract Signals (TBD) # Extract CMP returns cmp_returns <- portfolio_data %>% lapply(function(x) x$cmp) %>% rbindlist() cmp_returns <- cmp_returns %>% select(excntry, characteristic, size_grp, eom, n_stocks, signal_weighted, ret_weighted) # Cluster portfolios --------------- cluster_pfs <- cluster_labels[lms_returns, on = .(characteristic)][, .( n_factors = .N, ret_ew = mean(ret_ew), ret_vw = mean(ret_vw), ret_vw_cap = mean(ret_vw_cap) ), by = .(excntry, cluster, eom)] if (settings$daily_pf) { cluster_pfs_daily <- cluster_labels[lms_daily, on = 
.(characteristic)][, .(
    n_factors = .N,
    ret_ew = mean(ret_ew),
    ret_vw = mean(ret_vw),
    ret_vw_cap = mean(ret_vw_cap)
  ), by = .(excntry, cluster, date)]
}
# Regional Portfolios ------------------------------------------------
# regional_data: aggregate country-level long-short factor (or cluster) returns into a
# regional portfolio.
#   data          - data.table of country-level returns (needs excntry, n_stocks_min,
#                   direction, ret_ew/ret_vw/ret_vw_cap, and the date/char columns)
#   mkt           - market returns per country (needs excntry, the date column, mkt_vw_exc,
#                   and the weighting inputs me_lag1 / stocks)
#   date_col      - name of the date column ("eom" for monthly, "date" for daily)
#   char_col      - name of the factor identifier column ("characteristic" or "cluster")
#   countries     - character vector of country codes included in the region
#   weighting     - "market_cap", "stocks" or "ew" country weighting
#   countries_min - minimum number of countries per (factor, date) observation
#   periods_min   - minimum number of periods a factor needs to be kept
#   stocks_min    - minimum n_stocks_min per country-level observation
# Returns a data.table keyed by (char_col, date_col) with country-weighted returns.
regional_data <- function(data, mkt, date_col, char_col, countries, weighting, countries_min, periods_min, stocks_min) {
  # Determine Country Weights
  weights <- mkt[, .(excntry, get(date_col), mkt_vw_exc, "country_weight" = case_when(
    weighting == "market_cap" ~ me_lag1,
    weighting == "stocks" ~ as.double(stocks),
    weighting == "ew" ~ 1)
  )]
  # get(date_col) comes out as an unnamed column "V2"; give it the generic name used below.
  # NOTE(review): setnames/setorder modify by reference, so piping without reassignment still mutates.
  weights %>% setnames(old="V2", new="date_col")
  # Portfolio Return
  pf <- data[excntry %in% countries & n_stocks_min >= stocks_min]
  pf %>% setnames(old=c(date_col, char_col), new = c("date_col", "char_col"))
  pf <- weights[pf, on = .(excntry, date_col)]
  # Country-weighted averages across countries, per factor and date
  pf <- pf[!is.na(mkt_vw_exc), .(
    n_countries = .N,
    direction = unique(direction),
    ret_ew = sum(ret_ew*country_weight) / sum(country_weight),
    ret_vw = sum(ret_vw*country_weight) / sum(country_weight),
    ret_vw_cap = sum(ret_vw_cap*country_weight) / sum(country_weight),
    mkt_vw_exc = sum(mkt_vw_exc * country_weight) / sum(country_weight)
  ), by = .(char_col, date_col)]
  # Minimum Requirement: Countries
  pf <- pf[n_countries >= countries_min]
  # Minimum Requirement: Months
  pf[, periods := .N, by = .(char_col)]
  pf <- pf[periods >= periods_min][, periods := NULL]
  pf %>% setorder(char_col, date_col)
  # Restore the caller's original column names before returning
  pf %>% setnames(old = c("date_col", "char_col"), new = c(date_col, char_col))
  return(pf)
}
# Regional Factors
# Build one regional long-short factor dataset per row of `regions`
regional_pfs <- 1:nrow(regions) %>% lapply(function(i) {
  info <- regions[i, ]
  reg_pf <- lms_returns %>%
    regional_data(mkt=market, countries = unlist(info$country_codes), date_col = "eom",
                  char_col = "characteristic", weighting = settings$regional_pfs$country_weights,
                  countries_min = info$countries_min,
                  periods_min = settings$regional_pfs$months_min,
                  stocks_min = settings$regional_pfs$stocks_min)
  reg_pf %>%
    mutate(region = info$name) %>%
    select(region, characteristic,
direction, eom, n_countries, ret_ew, ret_vw, ret_vw_cap, mkt_vw_exc) }) %>% rbindlist() if (settings$daily_pf) { regional_pfs_daily <- 1:nrow(regions) %>% lapply(function(i) { info <- regions[i, ] reg_pf <- lms_daily %>% regional_data(mkt=market_daily, countries = unlist(info$country_codes), date_col = "date", char_col = "characteristic", weighting = settings$regional_pfs$country_weights, countries_min = info$countries_min, periods_min = settings$regional_pfs$months_min*21, stocks_min = settings$regional_pfs$stocks_min) reg_pf %>% mutate(region = info$name) %>% select(region, characteristic, direction, date, n_countries, ret_ew, ret_vw, ret_vw_cap, mkt_vw_exc) }) %>% rbindlist() } # Regional Cluster Portfolios regional_clusters <- 1:nrow(regions) %>% lapply(function(i) { info <- regions[i, ] reg_pf <- cluster_pfs %>% rename("n_stocks_min"=n_factors) %>% mutate(direction = NA_real_) %>% # Hack to make the function applicable regional_data(mkt=market, countries = unlist(info$country_codes), date_col = "eom", char_col = "cluster", weighting = settings$regional_pfs$country_weights, countries_min = info$countries_min, periods_min = settings$regional_pfs$months_min, stocks_min = 1) reg_pf %>% mutate(region = info$name) %>% select(region, cluster, eom, n_countries, ret_ew, ret_vw, ret_vw_cap, mkt_vw_exc) }) %>% rbindlist() if (settings$daily_pf) { regional_clusters_daily <- 1:nrow(regions) %>% lapply(function(i) { info <- regions[i, ] reg_pf <- cluster_pfs_daily %>% rename("n_stocks_min"=n_factors) %>% mutate(direction = NA_real_) %>% # Hack to make the function applicable regional_data(mkt=market_daily, countries = unlist(info$country_codes), date_col = "date", char_col = "cluster", weighting = settings$regional_pfs$country_weights, countries_min = info$countries_min, periods_min = settings$regional_pfs$months_min*21, stocks_min = 1) reg_pf %>% mutate(region = info$name) %>% select(region, cluster, date, n_countries, ret_ew, ret_vw, ret_vw_cap, mkt_vw_exc) }) %>% 
rbindlist() } # Save ---------------- if(!is.null(legacy_path)) { # Save Time Stamped Files folder <- paste0(legacy_path, "/Past Portfolios/", Sys.Date()) dir.create(folder) settings %>% saveRDS(file = paste0(folder, "/settings.RDS")) market[eom <= settings$end_date] %>% fwrite(file = paste0(folder, "/market_returns.csv")) market_daily[date <= settings$end_date] %>% fwrite(file = paste0(folder, "/market_returns_daily.csv")) hml_returns[eom <= settings$end_date] %>% fwrite(file = paste0(folder, "/hml.csv")) cmp_returns[eom <= settings$end_date] %>% fwrite(file = paste0(folder, "/cmp.csv")) if (settings$daily_pf) { lms_daily[date <= settings$end_date] %>% fwrite(file = paste0(folder, "/lms_daily.csv")) } if (settings$ind_pf) { gics_returns[eom <= settings$end_date] %>% fwrite(file = paste0(folder, "/industry_gics.csv")) if (nrow(ff49_returns) != 0) { ff49_returns[eom <= settings$end_date] %>% fwrite(file = paste0(folder, "/industry_ff49.csv")) } } } # Save Most Recent Files market[eom <= settings$end_date] %>% fwrite(file = paste0(output_path, "/market_returns.csv")) pf_returns[eom <= settings$end_date] %>% fwrite(file = paste0(output_path, "/pfs.csv")) hml_returns[eom <= settings$end_date] %>% fwrite(file = paste0(output_path, "/hml.csv")) lms_returns[eom <= settings$end_date] %>% fwrite(file = paste0(output_path, "/lms.csv")) cmp_returns[eom <= settings$end_date] %>% fwrite(file = paste0(output_path, "/cmp.csv")) cluster_pfs[eom <= settings$end_date] %>% fwrite(file = paste0(output_path, "/clusters.csv")) if (settings$daily_pf) { market_daily[date <= settings$end_date] %>% fwrite(file = paste0(output_path, "/market_returns_daily.csv")) pf_daily[date <= settings$end_date] %>% fwrite(file = paste0(output_path, "/pfs_daily.csv")) hml_daily[date <= settings$end_date] %>% fwrite(file = paste0(output_path, "/hml_daily.csv")) lms_daily[date <= settings$end_date] %>% fwrite(file = paste0(output_path, "/lms_daily.csv")) cluster_pfs_daily[date <= settings$end_date] %>% 
fwrite(file = paste0(output_path, "/clusters_daily.csv")) } if (settings$ind_pf) { gics_returns[eom <= settings$end_date] %>% fwrite(file = paste0(output_path, "/industry_gics.csv")) if (nrow(ff49_returns) != 0) { ff49_returns[eom <= settings$end_date] %>% fwrite(file = paste0(output_path, "/industry_ff49.csv")) } } # Regional Factors reg_folder <- paste0(output_path, "/Regional Factors") if (!dir.exists(reg_folder)) { dir.create(reg_folder) } for (reg in unique(regional_pfs$region)) { regional_pfs[eom <= settings$end_date & region %in% reg] %>% fwrite(file = paste0(reg_folder, "/", str_to_sentence(reg), ".csv")) } if (settings$daily_pf) { reg_folder_daily <- paste0(output_path, "/Regional Factors Daily") if (!dir.exists(reg_folder_daily)) { dir.create(reg_folder_daily) } for (reg in unique(regional_pfs_daily$region)) { regional_pfs_daily[date <= settings$end_date & region %in% reg] %>% fwrite(file = paste0(reg_folder_daily, "/", str_to_sentence(reg), ".csv")) } } # Regional Clusters reg_folder <- paste0(output_path, "/Regional Clusters") if (!dir.exists(reg_folder)) { dir.create(reg_folder) } for (reg in unique(regional_clusters$region)) { regional_clusters[eom <= settings$end_date & region %in% reg] %>% fwrite(file = paste0(reg_folder, "/", str_to_sentence(reg), ".csv")) } if (settings$daily_pf) { reg_folder_daily <- paste0(output_path, "/Regional Clusters Daily") if (!dir.exists(reg_folder_daily)) { dir.create(reg_folder_daily) } for (reg in unique(regional_clusters_daily$region)) { regional_clusters_daily[date <= settings$end_date & region %in% reg] %>% fwrite(file = paste0(reg_folder_daily, "/", str_to_sentence(reg), ".csv")) } } # Save Long/Short Factors by Country cnt_folder <- paste0(output_path, "/Country Factors") if (!dir.exists(cnt_folder)) { dir.create(cnt_folder) } for (exc in unique(lms_returns$excntry)) { lms_returns[eom <= settings$end_date & excntry==exc] %>% fwrite(file = paste0(cnt_folder, "/", exc, ".csv")) } if (settings$daily_pf) { 
cnt_folder_daily <- paste0(output_path, "/Country Factors Daily") if (!dir.exists(cnt_folder_daily)) { dir.create(cnt_folder_daily) } for (exc in unique(lms_daily$excntry)) { lms_daily[date <= settings$end_date & excntry==exc] %>% fwrite(file = paste0(cnt_folder_daily, "/", exc, ".csv")) } } # Save supplementary information nyse_size_cutoffs %>% fwrite(file = paste0(output_path, "/nyse_cutoffs.csv")) ret_cutoffs %>% fwrite(file = paste0(output_path, "/return_cutoffs.csv")) if (settings$daily_pf) { ret_cutoffs_daily %>% fwrite(file = paste0(output_path, "/return_cutoffs_daily.csv")) } ================================================ FILE: GlobalFactors/project_macros.sas ================================================ ************************************************************* * HELPER MACROS ************************************************************ ; * Winsorize_own: Flexible version of WRDS %winsorize macro Only difference currently, is the explicit specification of perc_low and perc_high. This allows for winsorizing in each end and also to specify winsorization below 1% ; %macro winsorize_own(inset=, outset=, sortvar=, vars=, perc_low=1, perc_high=99, trim=0); /* List of all variables */ %let vars = %sysfunc(compbl(&vars)); %let nvars = %nwords(&vars); /* Display Output */ %put ### START.; /* Trimming / Winsorization Options */ %if &trim=0 %then %put ### Winsorization; %else %put ### Trimming; %put ### Number of Variables: &nvars; %put ### List of Variables: &vars; options nonotes; /* Ranking within &sortvar levels */ %put ### Sorting... ; proc sort data=&inset; by &sortvar; run; /* 2-tail winsorization/trimming */ %let var2 = %sysfunc(tranwrd(&vars,%str( ),%str(__ )))__; %let var_p1 = %sysfunc(tranwrd(&vars,%str( ),%str(__&perc_low )))__&perc_low ; %let var_p2 = %sysfunc(tranwrd(&vars,%str( ),%str(__&perc_high )))__&perc_high ; /* Theis: Handle naming if winsorization < 1%. 
In this case, digits cause problems */ %let var_p1 = %sysfunc(prxchange(s/\./_/, -1, &var_p1.)); /* Replace . with _ */ %let var_p2 = %sysfunc(prxchange(s/\./_/, -1, &var_p2.)); /* Replace . with _ */ /* Calculate upper and lower percentiles */ proc univariate data=&inset noprint; by &sortvar; var &vars; output out=_perc pctlpts=&perc_low &perc_high pctlpre=&var2; run; %if &trim=1 %then %let condition = %str(if myvars(i)>=perct2(i) or myvars(i)<=perct1(i) then myvars(i)=. ); %else %let condition = %str(myvars(i)=min(perct2(i),max(perct1(i),myvars(i))) ); %if &trim=0 %then %put ### Winsorizing at &perc_low.% and &perc_high.%... ; %else %put ### Trimming at &perc_low.% and &perc_high.%... ; /* Save output with trimmed/winsorized variables */ data &outset; merge &inset (in=a) _perc; by &sortvar; if a; array myvars {&nvars} &vars; array perct1 {&nvars} &var_p1; array perct2 {&nvars} &var_p2; do i = 1 to &nvars; if not missing(myvars(i)) then do; &condition; end; end; drop i &var_p1 &var_p2; run; /* House Cleaning */ proc sql; drop table _perc; quit; options notes; %put ### DONE . ; %put ; %mend winsorize_own; /* MACRO: RETURN CUTOFFS The output of the macro is the 0.1%, 1%, 99% and 99.9% percentile of excess returns which can later be used for winsorization The reason why this procedure is necesary is that we output the country country-by-country, which makes across country winsorization difficult. 
*/
/* Compute per-period tail percentiles (0.1 / 1 / 99 / 99.9) of ret, ret_local and ret_exc,
   used later to winsorize returns across countries.
     data=      input security-month/day dataset
     freq=      m (monthly, by eom) or d (daily, by year/month)
     out=       output dataset with one row per period and the percentile columns
     crsp_only= 1 restricts the sample to CRSP observations (source_crsp=1) */
%macro return_cutoffs(data=, freq=, out=, crsp_only=);
    /* Resolve the date variable and by-group for the chosen frequency */
    %if &freq.=m %then %do; %let date_var = eom; %let by_vars = eom; %end;
    %if &freq.=d %then %do; %let date_var = date; %let by_vars = year month; %end;
    /* Base sample: main, primary, common securities with non-missing excess return;
       ZWE is excluded (data issues) */
    %if &crsp_only.=1 %then %do;
        proc sort data=&data.(where=(source_crsp=1 and common=1 and obs_main=1 and exch_main=1 and primary_sec=1 and excntry ^= 'ZWE' and not missing(ret_exc))) out=base;
            by &date_var.;
        run;
    %end;
    %if &crsp_only.=0 %then %do;
        proc sort data=&data.(where=(common=1 and obs_main=1 and exch_main=1 and primary_sec=1 and excntry ^= 'ZWE' and not missing(ret_exc))) out=base;
            by &date_var.;
        run;
    %end;
    /* Daily data is grouped by calendar month */
    %if &freq.=d %then %do;
        data base; set base; year=year(date); month=month(date); run;
    %end;
    /* For each return type, compute percentiles per period; the first type initializes
       &out., subsequent types are merged on as extra columns */
    %let ret_types = ret ret_local ret_exc;
    %do i=1 %to %sysfunc(countw(&ret_types.));
        %let ret_type = %scan(&ret_types., &i.);
        proc univariate data=base noprint;
            by &by_vars.;
            var &ret_type.;
            output out=cutoffs n=n pctlpts=0.1 1 99 99.9 pctlpre=&ret_type._;
        run;
        %if &i.=1 %then %do;
            data &out.; set cutoffs; run;
        %end;
        %else %do;
            %if &freq.=m %then %do;
                proc sql;
                    create table &out. as
                    select a.*, b.&ret_type._0_1, b.&ret_type._1, b.&ret_type._99, b.&ret_type._99_9
                    from &out. as a left join cutoffs as b on a.eom=b.eom;
                quit;
            %end;
            %if &freq.=d %then %do;
                proc sql;
                    create table &out. as
                    select a.*, b.&ret_type._0_1, b.&ret_type._1, b.&ret_type._99, b.&ret_type._99_9
                    from &out.
as a left join cutoffs as b on a.year=b.year and a.month=b.month; quit; %end; %end; %end; proc delete data= cutoffs base; run; %mend; /* MACRO: NYSE SIZE CUTOFFS Used for determining size groups and me cap weights */ %macro nyse_size_cutoffs(data=, out=); proc sort data=&data.(where=(crsp_exchcd=1 and obs_main = 1 and exch_main = 1 and primary_sec = 1 and common = 1 and not missing(me))) out=nyse_stocks; by eom; run; proc means data=nyse_stocks noprint; by eom; var me; output out=&out.(drop=_type_ _freq_) N=n P1=nyse_p1 p20 = nyse_p20 P50=nyse_p50 p80 = nyse_p80; run; proc delete data=nyse_stocks; run; %mend; /* Flexible version of WRDS populate function which can also do daily frequency */ %macro populate_own(inset=, outset=, datevar=, idvar=, datename=, forward_max=, period=); /* Period in ('day', 'month') */ /* Start Macro*/ %put ; %put ### START. Populating Data --> Note that duplicate idvar and datevar will be removed; /* nodupkey sort necessary */ proc sort data=&inset. out=__temp nodupkey; by &idvar descending &datevar.; run; options nonotes; /* Populate Dates */ /* FORWARD_MAX is the Regular Periodicity or the Forward Population Intervals */ %let nid = %nwords(&idvar.); %let id2 = %scan(&idvar.,&nid.,%str( )); data &outset. ; format &datename. YYMMDDN8.; set __temp; by &idvar.; &datename. =&datevar.; output; following = lag(&datename.); forward_max = intnx('month', &datevar., &forward_max.,'e'); if first.&id2 then following = .; n = intck(&period.,&datename., min(following, forward_max)); do i=1 to n-1; &datename. = intnx(&period.,&datename. ,1,"E"); output; end; drop following forward_max n i; run; proc sort data=&outset. nodupkey; by &idvar. &datevar. &datename.; run; /* House Cleaning */ proc sql; drop table __temp; quit; options notes; %put ### DONE . Dataset &OUTSET. with &period. Frequency Generated ; %put ; %MEND populate_own; /* Generic expand in a data set with a start date column and an end date column. 
*/ %macro expand(data=, out=, id_vars=, start_date=, end_date=, freq='day', new_date_name=date); /*freq in ('day', 'month')*/ data __expanded; set &data.; format &new_date_name. YYMMDDN8.; do i = 0 to intck(&freq., &start_date., &end_date.); &new_date_name. = intnx(&freq., &start_date., i, 'e'); output; end; drop &start_date. &end_date. i; run; proc sort data=__expanded out=&out nodupkey; by &id_vars. &new_date_name.; run; proc delete data=__expanded; run; %mend expand; /* USD to Foreign FX Conversion Rate from Compustat*/ %macro compustat_fx(out=); data usd_curcdd; curcdd='USD'; datadate=input(put(19500101,8.),yymmdd8.); fx=1; format datadate yymmddn8.; run; /* comp.exrt_dly only starts in 1982 and since we convert to USD we know that the fx for USD is 1 */ proc sql; create table __fx1 as select distinct a.tocurd as curcdd , a.datadate, b.exratd/a.exratd as fx /*fx is quoted as x/USD so to go from x to USD do x*fx*/ from comp.exrt_dly a , comp.exrt_dly b where a.fromcurd = 'GBP' and b.tocurd = 'USD' /*b.exratd is always from GBP to USD, a.exratd is from GBP to currency X*/ and a.fromcurd = b.fromcurd and a.datadate = b.datadate; quit; data __fx2; set __fx1 usd_curcdd; run; proc sort data = __fx2; by curcdd descending datadate; run ; /* Carry forward fx observations in case gaps*/ data __fx3; format date YYMMDDN8.; set __fx2; by curcdd; date = datadate; output; following = lag(date); if first.curcdd then following = date+1; n = following-date; do i=1 to n-1; date = date+1; output; end; drop datadate following n i; run; proc sort data=__fx3 out=&out nodupkey; by curcdd date; run; proc delete data=usd_curcdd __fx1 __fx2 __fx3; run; %mend compustat_fx; ********************************************************************************************************************** * MACRO - Add Primary Security from Compustat * ********************************************************************************************************************** The macro expects a data set with 
gvkey and iid. It then returns the same dataset with a column "primary_sec" which is 1 when a security is the primary
security in the US, Canada or internationally. Importantly, a given company (gvkey) can potentially have up to 3
primary securities although it happens very rarely. In the full Compustat monthly dataset, of all gvkey-eom pairs we have
0 primary securities: 1.82%
1 primary securities: 95.10%
2 primary securities: 3.08%
3 primary securities: 0.01%
;
%macro add_primary_sec(data=, out=, date_var=);
  /* Pull the time-varying primary-issue indicators (ROW from global, USA/CAN
     from North America security history) */
  proc sql;
    create table __prihistrow as
    select gvkey, itemvalue as prihistrow, effdate, thrudate
    from comp.g_sec_history where item = 'PRIHISTROW';
  quit;
  proc sql;
    create table __prihistusa as
    select gvkey, itemvalue as prihistusa, effdate, thrudate
    from comp.sec_history where item = 'PRIHISTUSA';
  quit;
  proc sql;
    create table __prihistcan as
    select gvkey, itemvalue as prihistcan, effdate, thrudate
    from comp.sec_history where item = 'PRIHISTCAN';
  quit;
  /* Header (current) primary-issue indicators as fallback */
  proc sql;
    create table __header as
    select gvkey, prirow, priusa, prican from comp.company
    outer union corr
    select gvkey, prirow, priusa, prican from comp.g_company;
  quit;
  proc sort data=__header nodupkey; by gvkey; run; /* Only one duplicate (gvkey=254381)*/
  proc sql;
    create table __data1 as
    select distinct a.*,
      coalesce(b.prihistrow, e.prirow) as prihistrow,
      coalesce(c.prihistusa, e.priusa) as prihistusa,
      coalesce(d.prihistcan, e.prican) as prihistcan
      /* Prefer historical primary sec indicator, if this is not available, use header.
         Distinct avoids around 400 dup observations due to gvkey=213098 having overlapping prihistrow in g_sec_history */
    from &data. as a
    left join __prihistrow as b
      on a.gvkey=b.gvkey and a.&date_var.>=b.effdate and (a.&date_var.<=b.thrudate or missing(b.thrudate))
    left join __prihistusa as c
      on a.gvkey=c.gvkey and a.&date_var.>=c.effdate and (a.&date_var.<=c.thrudate or missing(c.thrudate))
    left join __prihistcan as d
      on a.gvkey=d.gvkey and a.&date_var.>=d.effdate and (a.&date_var.<=d.thrudate or missing(d.thrudate))
    left join __header as e
      on a.gvkey=e.gvkey;
    create table __data2 as
    select *, (not missing(iid) and (iid=prihistrow or iid=prihistusa or iid=prihistcan)) as primary_sec
      /* If the security id is identified as primary in either USA, CANADA or Rest of the World it is deemed a primary security */
    from __data1;
  quit;
  data &out.;
    set __data2(drop = prihistrow prihistusa prihistcan);
    /* note: no run; here - the following PROC DELETE implicitly ends this DATA step */
  proc delete data=__prihistrow __prihistusa __prihistcan __header __data1 __data2; run;
%mend add_primary_sec;

/* MACRO - COMPUSTAT EXCHANGES */
* This macro returns a dataset with classifications of individual exchanges in terms of country
  and an indicator of whether it's a normal or special exchange ;
%macro comp_exchanges(out=);
  /* Exchange Classification */
  /* What about 153? . Could consider excluding 111 since its the same securities traded on 110.
     This is just due to a rule in Thailand limiting the ownership of foreign investors in Thailand https://thaishares.com/nvdr/*/
  %let special_exchanges = (
    0, 1, 2, 3, 4, 15, 16, 17, 18, 21, /* US exchanges not in NYSE, Amex and NASDAQ */
    13, 19, 20, 127,
    150, /* AIAF Mercado De Renta Fija --> Spanish exchange for trading debt securities */
    157, 229, 263, 269, 281, 283, 290, 320, 326, 341, 342, 347, 348,
    349, /* BATS Chi-X Europe --> Trades stocks from various european exchanges. Should we keep it?*/
    352) /* CHI-X Australia --> Only Trades securities listed on ASX (exchg=106). Should we keep it?*/
  ;
  /* Determine Country of Exchange (Note that we assume that this is constant through time) */
  proc sql;
    create table __ex_country1 as
    select distinct exchg, excntry from comp.g_security
    outer union corr
    select distinct exchg, excntry from comp.security;
    /* Exchanges listing securities from more than one country are tagged 'multi_national' */
    create table __ex_country2 as
    select distinct exchg,
      case when count(excntry)>1 then 'multi_national' /*, calculated count > 1 as multi_national*/ else excntry end as excntry
    from __ex_country1
    where not missing(excntry) and not missing(exchg)
    group by exchg;
    create table __ex_country3 as
    select a.*, b.exchgdesc
    from __ex_country2 as a left join comp.r_ex_codes as b on a.exchg=b.exchgcd;
    create table &out. as
    select *, (excntry ^= 'multi_national' and exchg not in &special_exchanges.) as exch_main
    from __ex_country3;
  quit;
  proc delete data= __ex_country1 __ex_country2 __ex_country3; run;
%mend comp_exchanges;

**********************************************************************************************************************
* US - Data From CRSP
*********************************************************************************************************************
;
%macro prepare_crsp_sf(freq=m);
  /* freq in ('d', 'm') for daily and monthly. Returns crsp_msf if p=m and crsp_dsf if p=d*/
  /* CRSP with Company Information*/
  proc sql;
    create table __crsp_sf1 as
    select a.permno, a.permco, a.date,
      (a.prc < 0) as bidask,            /* negative prc marks a bid-ask average */
      abs(a.prc) as prc,
      a.shrout/1000 as shrout,          /* shares in millions */
      calculated prc * calculated shrout as me,
      a.ret, a.retx, a.cfacshr, a.vol,
      case when a.prc > 0 and a.askhi > 0 then a.askhi else . end as prc_high, /* Highest price when prc is not the bid-ask average (see WRDS CRSP dsf askhi documentation) */
      case when a.prc > 0 and a.bidlo > 0 then a.bidlo else . end as prc_low, /* Lowest price when prc is not the bid-ask average (see WRDS CRSP dsf bidlo documentation) */
      b.shrcd, b.exchcd,
      c.gvkey, c.liid as iid, /*http://www.crsp.org/products/documentation/crspccmlink-security-link-history*/
      b.exchcd in (1, 2, 3) as exch_main
    from crsp.&freq.sf as a
    left join crsp.&freq.senames as b
      on a.permno=b.permno and a.date>=namedt and a.date<=b.nameendt
    left join crsp.ccmxpf_lnkhist as c
      on a.permno=c.lpermno and (a.date>=c.linkdt or missing(c.linkdt)) and (a.date<=c.linkenddt or missing(c.linkenddt)) and c.linktype in ('LC', 'LU', 'LS');
  quit;
  /* Adjust trading volume following Gao and Ritter (2010)*/
  /* NOTE(review): the last boundary uses '<' so 31DEC2003 itself gets no 1.6 divisor,
     while the 2001 boundary uses '<='; Gao and Ritter apply the adjustment through
     year-end 2003 -- confirm whether '<=' was intended */
  proc sql;
    update __crsp_sf1
    set vol = case when date < '01FEB2001'd then vol / 2
                   when date <= '31DEC2001'd then vol / 1.8
                   when date < '31DEC2003'd then vol / 1.6
                   else vol end
    where exchcd = 3;
  quit;
  /* Add dividend and dollar volume */
  proc sort data=__crsp_sf1; by permno date; run;
  data __crsp_sf2;
    set __crsp_sf1;
    by permno;
    dolvol = abs(prc) * vol;
    div_tot = (ret-retx)*lag(prc)*(cfacshr/lag(cfacshr)); /* The CFACSHR part is to put it on the per share basis of the current date */
    if first.permno then div_tot=.;
  run;
  /* Incorporate Delisting Returns */
  %if &freq.=d %then %do;
    proc sql;
      create table __crsp_sf3 as
      select a.*, b.dlret, b.dlstcd
      from __crsp_sf2 as a left join crsp.&freq.sedelist as b
      on a.permno=b.permno and a.date=b.dlstdt;
    quit;
  %end;
  %if &freq.=m %then %do;
    proc sql;
      create table __crsp_sf3 as
      select a.*, b.dlret, b.dlstcd
      from __crsp_sf2 as a left join crsp.&freq.sedelist as b
      on a.permno=b.permno and year(a.date)=year(b.dlstdt) and month(a.date)=month(b.dlstdt);
    quit;
  %end;
  data __crsp_sf4;
    set __crsp_sf3;
    if missing(dlret) and (dlstcd=500 or (520<=dlstcd<=584)) then dlret=-0.3; /*If delisting is missing and is for performance related reasons. Set to -30%. This is relevant to 155 observations only*/
    if missing(ret) and not missing(dlret) then ret=0;
    ret= (1+ret)*(1+coalesce(dlret, 0))-1; /*If missing set to zero*/
    drop dlret dlstcd;
  run;
  * Excess Return;
  %if &freq.=d %then %let scale=21; /* monthly risk-free rate spread over ~21 trading days */
  %if &freq.=m %then %let scale=1;
  proc sql;
    create table __crsp_sf5 as
    select a.*, a.ret-coalesce(b.t30ret, c.rf)/&scale. as ret_exc /* I prefer crsp.mcti but FF has monthly updates */
    from __crsp_sf4 as a
    left join crsp.mcti as b on year(a.date)=year(b.caldt) and month(a.date)=month(b.caldt)
    left join ff.factors_monthly as c on year(a.date)=year(c.date) and month(a.date)=month(c.date);
  quit;
  * Company Market Equity;
  proc sql;
    create table __crsp_sf6 as
    select *, sum(me) as me_company
    from __crsp_sf5
    group by permco, date;
  quit;
  * Make volume comparable across daily and monthly set (see WRDS CRSP vol documentation);
  %if &freq.=m %then %do;
    proc sql;
      update __crsp_sf6 set vol = vol*100, dolvol = dolvol*100;
    quit;
  %end;
  proc sort nodupkey data=__crsp_sf6; by permno date; run; /*In monthly file: Two duplicates 15075-20180131 and 86812-20190731 In daily file 13 obs*/
  data crsp_&freq.sf;
    set __crsp_sf6;
  proc delete data= __crsp_sf1 __crsp_sf2 __crsp_sf3 __crsp_sf4 __crsp_sf5 __crsp_sf6;
%mend prepare_crsp_sf;

**********************************************************************************************************************
* World - Data From Compustat
*********************************************************************************************************************
;
%macro prepare_comp_sf(freq=);
  /* freq in (d, m, both) */
  /* SECD has a lot of missing CSHOC. Therefore we use information from Accounting Statements. This is not a problem for g_secd*/
  %let comp_cond = indfmt='INDL' and datafmt='STD' and popsrc='D' and consol='C';
  proc sql;
    create table __firm_shares1 as
    select gvkey, datadate, cshoq as csho_fund, ajexq as ajex_fund
    from comp.fundq where &comp_cond. and not missing(cshoq) and not missing(ajexq)
    outer union corr
    select gvkey, datadate, csho as csho_fund, ajex as ajex_fund
    from comp.funda where &comp_cond. and not missing(csho) and not missing(ajex);
  quit;
  /* Carry fundamentals-based share counts forward daily for up to 12 months */
  %populate_own(inset=__firm_shares1, outset=__firm_shares2, datevar=datadate, idvar=gvkey, datename=ddate, forward_max=12, period = 'day');
  proc sql;
    create table __comp_dsf_na as
    select a.gvkey, a.iid, a.datadate, a.tpci, a.exchg, a.prcstd, a.curcdd,
      a.prccd as prc_local, a.ajexdi,
      case when a.prcstd^=5 then a.prchd else . end as prc_high_lcl, /* Highest price when prc is not the bid-ask average (see WRDS Compustat secd prchd documentation) */
      case when a.prcstd^=5 then a.prcld else . end as prc_low_lcl, /* Lowest price when prc is not the bid-ask average (see WRDS Compustat data guide) */
      cshtrd,
      coalesce(a.cshoc/1e6, b.csho_fund*b.ajex_fund/a.ajexdi) as cshoc, /* Prefer cshoc but if missing choose shares outstanding from accounting statement adjusted for issuance activity*/
      (a.prccd/a.ajexdi*a.trfd) as ri_local,
      a.curcddv, a.div, a.divd, a.divsp /* Dividend Variables */
    from comp.secd as a
    left join __firm_shares2 as b on a.gvkey=b.gvkey and a.datadate=b.ddate;
    /* Adjust trading volume of NASDAQ stocks following Gao and Ritter (2010)*/
    /* NOTE(review): as in prepare_crsp_sf, '<' excludes 31DEC2003 itself -- confirm intended */
    update __comp_dsf_na
    set cshtrd = case when datadate < '01FEB2001'd then cshtrd / 2
                      when datadate <= '31DEC2001'd then cshtrd / 1.8
                      when datadate < '31DEC2003'd then cshtrd / 1.6
                      else cshtrd end
    where exchg = 14;
    create table __comp_dsf_global as
    select gvkey, iid, datadate, tpci, exchg, prcstd, curcdd,
      prccd/qunit as prc_local, ajexdi, cshoc/1e6 as cshoc,
      case when prcstd^=5 then prchd/qunit else . end as prc_high_lcl, /* Highest price when prc is not the bid-ask average (see WRDS Compustat secd prchd documentation) */
      case when prcstd^=5 then prcld/qunit else . end as prc_low_lcl, /* Lowest price when prc is not the bid-ask average (see WRDS Compustat data guide) */
      cshtrd,
      ((calculated prc_local)/ajexdi*trfd) as ri_local,
      curcddv, div, divd, divsp
    from comp.g_secd;
    create table __comp_dsf1 as
    select * from __comp_dsf_na
    outer union corr
    select * from __comp_dsf_global;
  quit;
  /* Add FX */
  %compustat_fx(out=fx);
  proc sql;
    create table __comp_dsf2 as
    select a.*, b.fx as fx, c.fx as fx_div
    from __comp_dsf1 as a
    left join fx as b on a.curcdd=b.curcdd and a.datadate=b.date
    left join fx as c on a.curcddv=c.curcdd and a.datadate=c.date;
  quit;
  data __comp_dsf3;
    set __comp_dsf2;
    /* Price Adjustment */
    prc = prc_local*fx;
    prc_high = prc_high_lcl*fx;
    prc_low = prc_low_lcl*fx;
    me = prc*cshoc;
    dolvol = cshtrd*prc;
    ri = ri_local*fx;
    /* Dividend Adjustment (set to zero if missing)*/
    div_tot = coalesce(div, 0)*fx_div;
    div_cash = coalesce(divd, 0)*fx_div;
    div_spc = coalesce(divsp, 0)*fx_div;
    drop div divd divsp fx_div curcddv prc_high_lcl prc_low_lcl;
  run;
  * Create Daily, Monthly or Both Datasets;
  %if &freq. = m or &freq. = d %then %let iter_max = 1;
  %if &freq. = both %then %let iter_max = 2;
  %do iter=1 %to &iter_max.;
    %if &freq. = m or &freq. = d %then %let freq_use = &freq.; /* Necessary because of the case where both daily AND monthly datasets are created */
    %if &freq. = both and &iter. = 1 %then %let freq_use = d;
    %if &freq. = both and &iter. = 2 %then %let freq_use = m;
    %if &freq_use.=m %then %do;
      /* Aggregate the daily file to monthly; split-adjust within month before
         summing/extremizing, then re-scale with month-end ajexdi */
      proc sql;
        create table __comp_msf1 as
        select *, intnx('month', datadate,0,'E') as eom format=YYMMDDN8.,
          max(max(prc_high/ajexdi), max(prc/ajexdi))*ajexdi as prc_highm,
          min(min(prc_low/ajexdi), min(prc/ajexdi)) * ajexdi as prc_lowm,
          sum(div_tot/ajexdi)*ajexdi as div_totm,
          sum(div_cash/ajexdi)*ajexdi as div_cashm,
          sum(div_spc/ajexdi)*ajexdi as div_spcm,
          sum(cshtrd/ajexdi)*ajexdi as cshtrm,
          sum(dolvol) as dolvolm
        from __comp_dsf3
        group by gvkey, iid, calculated eom;
        create table __comp_msf2 as
        select * from __comp_msf1(drop= cshtrd div_tot div_cash div_spc dolvol prc_high prc_low)
        where not missing(prc_local) and not missing(curcdd) and prcstd in (3, 4, 10) /* This is rather important when choosing last.eom later. Otherwise we sometimes have an end of month observation which is a dividend distribution rather than a trading day */
        order by gvkey, iid, eom, datadate;
      quit;
      data __comp_msf2;
        set __comp_msf2;
        rename div_totm=div_tot div_cashm=div_cash div_spcm=div_spc dolvolm=dolvol prc_highm=prc_high prc_lowm=prc_low;
      run;
      /* Choose Last observation in Month */
      data __comp_msf3;
        set __comp_msf2;
        by gvkey iid eom;
        if last.eom;
      run;
      /* Add Information from SECM */
      proc sql;
        create table __comp_secm1 as
        select a.gvkey, a.iid, a.datadate, intnx('month', a.datadate,0,'E') as eom format=YYMMDDN8.,
          a.tpci, a.exchg, a.curcdm as curcdd, a.prccm as prc_local, a.prchm as prc_high, a.prclm as prc_low, a.ajexm as ajexdi,
          coalesce(a.cshom/1e6, a.csfsm/1e3, a.cshoq, b.csho_fund*b.ajex_fund/a.ajexm) as cshoc, /* Notice again that I impute with shares from accounting statements in case it is missing*/
          a.dvpsxm, a.cshtrm, a.curcddvm,
          a.prccm/a.ajexm*a.trfm as ri_local, /*ri_local = local return index [1]*/
          c.fx as fx, d.fx as fx_div
        from comp.secm as a
        left join __firm_shares2 as b on a.gvkey=b.gvkey and a.datadate=b.ddate
        left join fx as c on a.curcdm=c.curcdd and a.datadate=c.date
        left join fx as d on a.curcddvm=d.curcdd and a.datadate=d.date;
        /* NOTE(review): same Gao-Ritter boundary remark as above ('<' on 31DEC2003) */
        update __comp_secm1
        set cshtrm = case when datadate < '01FEB2001'd then cshtrm / 2
                          when datadate <= '31DEC2001'd then cshtrm / 1.8
                          when datadate < '31DEC2003'd then cshtrm / 1.6
                          else cshtrm end
        where exchg = 14;
      quit;
      data __comp_secm2;
        set __comp_secm1;
        /* Price Adjustment */
        if curcdd='USD' then fx=1;
        prc = prc_local*fx;
        prc_high = prc_high*fx;
        prc_low = prc_low*fx;
        me = prc*cshoc;
        dolvol = cshtrm*prc;
        ri = ri_local*fx;
        /* Dividend Adjustment*/
        if curcddvm='USD' then fx_div=1;
        div_tot = dvpsxm*fx_div;
        div_cash = .; /* Not available in SECM*/
        div_spc = .; /* Not available in SECM*/
        drop dvpsxm fx_div curcddvm;
      run;
      %let common_vars=gvkey, iid, datadate, eom, tpci, exchg, curcdd, prc_local, prc_high, prc_low, ajexdi, cshoc, ri_local, fx, prc, me, cshtrm, dolvol, ri, div_tot, div_cash, div_spc;
      proc sql;
        create table __comp_msf4 as
        select &common_vars., prcstd, 'secd' as source from __comp_msf3
        union
        select &common_vars., 10 as prcstd, 'secm' as source from __comp_secm2; /* This ensures that all observations are kept in the __returns step. Erroneous prcstd codes have been screened out */
        create table __comp_msf5 as
        select * from __comp_msf4
        group by gvkey, iid, eom
        having count(*)=1 | (count(*)=2 and source='secd'); /* If a security has an observation in both SECD and SECM. Prefer the observation from SECD*/
      quit;
      proc sort nodupkey data=__comp_msf5(drop=source) out=__comp_msf6; by gvkey iid eom; run; /* DUPLICATES should always be 0!*/
      proc delete data=__comp_msf1 __comp_msf2 __comp_msf3 __comp_msf4 __comp_msf5 __comp_secm1 __comp_secm2; run;
      %let base=__comp_msf6;
      %let period = 'month';
      %let out = comp_msf;
    %end;
    %if &freq_use.=d %then %do;
      %let base=__comp_dsf3;
      %let period = 'day';
      %let out = comp_dsf;
    %end;
    /* Compute Returns */
    proc sort nodupkey data= &base. out=__comp_sf1; by gvkey iid datadate; run; /* Very important to know if there are any duplicates!!*/
    data __returns;
      set __comp_sf1(where = (not missing(ri) and prcstd in (3, 4, 10))); /* The screen is important, see [1] */
      by gvkey iid;
      ret = ri/lag(ri)-1;
      ret_local = ri_local/lag(ri_local)-1;
      ret_lag_dif = intck(&period., lag(datadate), datadate); /* gap (in periods) since previous return observation */
      if first.iid then do;
        ret=.;
        ret_local=.;
        ret_lag_dif=.;
      end;
      /* Handle situations where currency code changes */
      if first.iid=0 and curcdd^=lag(curcdd) then do;
        ret_local = ret;
      end;
      keep gvkey iid datadate ret ret_local ret_lag_dif;
    run;
    /* Handling Delisting */
    data __sec_info; set comp.security comp.g_security; run; /* Combine SECURITY and G_SECURITY*/
    data __delist1;
      set __returns(where=(not missing(ret_local) and ret_local^=0)); /* Take delisting date to be last day of trading with non missing and non zero return [2]*/
      by gvkey iid datadate;
      if last.iid;
    run;
    proc sql;
      create table __delist2 as
      select a.gvkey, a.iid, a.datadate, b.secstat, b.dlrsni
      from __delist1 as a left join __sec_info as b on a.gvkey=b.gvkey and a.iid=b.iid;
      /* dlrsni 02/03 are performance-related deletions: assign -30 pct delisting return */
      create table __delist3 as
      select gvkey, iid, datadate as date_delist,
        case when dlrsni in ('02', '03') then -0.3 else 0 end as dlret
      from __delist2
      where secstat='I';
    quit;
    * Incorporate Delisting Return;
    proc sql;
      create table __comp_sf2 as
      select a.*, b.ret, b.ret_local, b.ret_lag_dif, c.date_delist, c.dlret
      from &base as a
      left join __returns as b on a.gvkey=b.gvkey and a.iid=b.iid and a.datadate=b.datadate
      left join __delist3 as c on a.gvkey=c.gvkey and a.iid=c.iid;
    quit;
    data __comp_sf3;
      set __comp_sf2;
      where datadate<=date_delist or missing(date_delist); /* In a sample of 104,377 this removes 1,434 obs*/
      if datadate=date_delist then do;
        ret = (1+ret)*(1+dlret)-1;
        ret_local = (1+ret_local)*(1+dlret)-1;
      end;
      drop ri ri_local date_delist dlret;
    run;
    /* Excess Return */
    %if &freq_use.=d %then %let scale=21;
    %if &freq_use.=m %then %let scale=1;
    proc sql;
      create table __comp_sf4 as
      select a.*, a.ret-coalesce(b.t30ret, c.rf)/&scale. as ret_exc /* I prefer crsp.mcti but FF has monthly updates */
      from __comp_sf3 as a
      left join crsp.mcti as b on year(a.datadate)=year(b.caldt) and month(a.datadate)=month(b.caldt)
      left join ff.factors_monthly as c on year(a.datadate)=year(c.date) and month(a.datadate)=month(c.date);
    quit;
    /* Add Exchange Information*/
    %comp_exchanges(out=__exchanges);
    proc sql;
      create table __comp_sf5 as
      select a.*, b.excntry, b.exch_main
      from __comp_sf4 as a left join __exchanges as b on a.exchg=b.exchg;
    quit;
    /* Add Primary Security Indicator?*/
    %add_primary_sec(data=__comp_sf5, out=__comp_sf6, date_var=datadate);
    /* Output */
    proc sort nodupkey data=__comp_sf6 out=&out.; by gvkey iid datadate; run;
  %end;
  /* proc freq data=comp_dsf; tables ret_day_dif; run;
     Check: 97.2% of non missing ret_day_dif are <=3. 98.8% are <=4; 99.75% are <=10.
     This might be a reasonable general cutoff for setting returns to 0 */
  proc delete data=__firm_shares1 __firm_shares2 fx __comp_dsf_na __comp_dsf_global __comp_dsf1 __comp_dsf2 __comp_dsf3 __returns __sec_info __delist1 __delist2 __delist3 __comp_sf1 __comp_sf2 __comp_sf3 __comp_sf4 __comp_sf5 __comp_sf6 __exchanges &base.; run;
%mend prepare_comp_sf;

/* COMBINE CRSP AND COMPUSTAT WITH CRSP PREFERENCE*/
%macro combine_crsp_comp_sf(out_msf=, out_dsf=, crsp_msf=, comp_msf=, crsp_dsf=, comp_dsf=);
  /* Stacks the CRSP (US) and Compustat (world) security files into harmonized
     monthly (&out_msf.) and daily (&out_dsf.) datasets. For CRSP, id=permno;
     for Compustat a numeric id is built from a share-class prefix + gvkey + iid digits.
     obs_main flags the preferred record when CRSP and Compustat overlap. */
  /* Monthly Files */
  proc sql;
    create table __msf_world1 as
    select permno as id, PERMNO as permno, PERMCO as permco, GVKEY as gvkey, iid,
      'USA' as excntry length=3, exch_main,
      (shrcd in (10, 11, 12)) as common, 1 as primary_sec, bidask,
      shrcd as crsp_shrcd, exchcd as crsp_exchcd, '' as comp_tpci, . as comp_exchg,
      'USD' as curcd, 1 as fx, date, intnx('month',date,0,'E') as eom format=YYMMDDN8.,
      cfacshr as adjfct, shrout as shares, me, me_company,
      prc, prc as prc_local, prc_high, prc_low, dolvol, vol as tvol,
      RET as ret, ret as ret_local, ret_exc, 1 as ret_lag_dif,
      div_tot, . as div_cash, . as div_spc, 1 as source_crsp
    from &crsp_msf.
    outer union corr
    select case when prxmatch("/W/", iid) then input(cats('3', gvkey, substr(iid, 1, 2)), 9.0)
                when prxmatch("/C/", iid) then input(cats('2', gvkey, substr(iid, 1, 2)), 9.0)
                else input(cats('1', gvkey, substr(iid, 1, 2)), 9.0) /* IID characters can only take 3 possible values: http://zeerovery.nl/blogfiles/Comp-IID.pdf*/
           end as id,
      . as permno, . as permco, gvkey, iid, excntry, exch_main,
      (tpci='0') as common, primary_sec, (prcstd = 4) as bidask,
      . as crsp_shrcd, . as crsp_exchcd, tpci as comp_tpci, exchg as comp_exchg,
      curcdd as curcd, fx, datadate as date, eom,
      ajexdi as adjfct, cshoc as shares, me, me as me_company,
      prc, prc_local, prc_high, prc_low, dolvol, cshtrm as tvol,
      ret_local, ret, ret_exc, ret_lag_dif,
      div_tot, div_cash, div_spc, 0 as source_crsp
    from &comp_msf.;
  quit;
  /* Add Excess Return over Month t+1*/
  proc sort data=__msf_world1 ; by id descending eom; run;
  data __msf_world2;
    set __msf_world1;
    /* Descending sort, so lag(ret_exc) is next month's excess return; only valid
       when the next record is the same id one month ahead (ret_lag_dif=1) */
    ret_exc_lead1m = lag(ret_exc);
    if lag(id)^=id or lag(ret_lag_dif)^=1 then ret_exc_lead1m = .;
  run;
  /* Daily Files */
  proc sql;
    create table __dsf_world1 as
    select permno as id, 'USA' as excntry length=3, exch_main,
      (shrcd in (10, 11, 12)) as common, 1 as primary_sec, bidask,
      'USD' as curcd, 1 as fx, DATE as date format=YYMMDDN8., intnx('month',DATE,0,'E') as eom format=YYMMDDN8.,
      cfacshr as adjfct, shrout as shares, me, dolvol, vol as tvol,
      prc, prc_high, prc_low, ret as ret_local, RET as ret, ret_exc, 1 as ret_lag_dif,
      1 as source_crsp /* More memory efficient with integer instead of character vector*/
    from &crsp_dsf.
    outer union corr
    select case when prxmatch("/W/", iid) then input(cats('3', gvkey, substr(iid, 1, 2)), 9.0)
                when prxmatch("/C/", iid) then input(cats('2', gvkey, substr(iid, 1, 2)), 9.0)
                else input(cats('1', gvkey, substr(iid, 1, 2)), 9.0) /* IID characters can only take 3 possible values: http://zeerovery.nl/blogfiles/Comp-IID.pdf*/
           end as id,
      excntry, exch_main, (tpci='0') as common, primary_sec, (prcstd = 4) as bidask,
      curcdd as curcd, fx, datadate as date, intnx('month',datadate,0,'E') as eom format=YYMMDDN8.,
      ajexdi as adjfct, cshoc as shares, me, dolvol, cshtrd as tvol,
      prc, prc_high, prc_low, ret_local, ret, ret_exc, ret_lag_dif,
      0 as source_crsp
    from &comp_dsf.;
  quit;
  /* Choose the main observation based on monthly data */
  * If multiple observations for the same GVKEY-IID, Then choose CRSP as the main observation;
  proc sql;
    create table __obs_main as
    select id, gvkey, iid, eom,
      (count(gvkey) in (0, 1) or (count(gvkey)>1 and source_crsp=1)) as obs_main
    from __msf_world2
    group by gvkey, iid, eom;
    create table __msf_world3 as
    select a.*, b.obs_main
    from __msf_world2 as a left join __obs_main as b on a.id = b.id and a.eom = b.eom;
    create table __dsf_world2 as
    select a.*, b.obs_main
    from __dsf_world1 as a left join __obs_main as b on a.id = b.id and a.eom = b.eom;
  quit;
  proc sort data=__msf_world3 out=&out_msf. nodupkey; by id eom; run;
  proc sort data=__dsf_world2 out=&out_dsf. nodupkey; by id date; run;
  proc delete data= __msf_world1 __msf_world2 __msf_world3 __dsf_world1 __dsf_world2 __obs_main; run;
%mend combine_crsp_comp_sf;

* MACRO: CLEAN_COMP_MSF
  - Remove obvious Compustat data errors by setting return to missing.
  - Currently only implemented for monthly Compustat file, should be expanded to daily file
;
%macro clean_comp_msf(data=);
  /* Hard-coded security-month pairs with known erroneous returns */
  proc sql;
    update &data.
    set ret=., ret_local=., ret_exc=.
    where gvkey = '002137' and iid = '01C' and eom in ('31DEC1983'd, '31JAN1984'd);
    update &data.
    set ret=., ret_local=., ret_exc=.
    where gvkey = '013633' and iid = '01W' and eom in ('28FEB1995'd);
  quit;
%mend;

* MACRO: MARKET RETURNS
  If wins_comp=1, need to supply wins_data as well.
;
%macro market_returns(out=, data=, freq=m, wins_comp=1, wins_data=, cap_data=);
  /* Country-level market returns (value-weighted, capped value-weighted and
     equal-weighted; USD, local and excess versions), weighted by lagged ME.
     - wins_comp=1: winsorize Compustat-sourced returns at the cutoffs in &wins_data.
     - cap_data: NYSE size cutoffs; ME is capped at the 80th percentile for the
       capped value weights */
  %if &freq.=d %then %do;
    %let dt_col = date;
    %let max_date_lag = 14;
  %end;
  %if &freq.=m %then %do;
    %let dt_col = eom;
    %let max_date_lag = 1;
  %end;
  /* Create Index Data */
  proc sql;
    create table updated_data as
    select a.*, b.nyse_p80
    from &data. as a left join &cap_data. as b on a.eom = b.eom;
  quit;
  data updated_data;
    set updated_data;
    if me <= nyse_p80 then me_cap = me;
    else me_cap = nyse_p80;
    drop nyse_p80;
  run;
  /* Create Index Data */
  proc sql;
    create table __common_stocks1 as
    select distinct source_crsp, id, date, eom, excntry, obs_main, exch_main, primary_sec, common, ret_lag_dif, me, me_cap, dolvol, ret, ret_local, ret_exc
    from updated_data
    order by id, &dt_col.;
  quit;
  data __common_stocks2;
    set __common_stocks1;
    by id;
    me_lag1 = lag(me);
    me_cap_lag1 = lag(me_cap);
    dolvol_lag1 = lag(dolvol);
    if first.id then do;
      me_lag1 = .;
      me_cap_lag1 = .;
      dolvol_lag1 = .;
    end;
  run;
  %if &wins_comp. = 1 %then %do;
    %if &freq.=m %then %do;
      proc sql;
        create table __common_stocks3 as
        select a.*, b.ret_exc_0_1, b.ret_exc_99_9, b.ret_0_1, b.ret_99_9, b.ret_local_0_1, b.ret_local_99_9
        from __common_stocks2 as a left join &wins_data. as b on a.eom=b.eom;
      quit;
    %end;
    %if &freq.=d %then %do;
      proc sql;
        create table __common_stocks3 as
        select a.*, b.ret_exc_0_1, b.ret_exc_99_9, b.ret_0_1, b.ret_99_9, b.ret_local_0_1, b.ret_local_99_9
        from __common_stocks2 as a left join &wins_data. as b on year(a.date)=b.year and month(a.date)=b.month;
      quit;
    %end;
    /* Winsorization is applied to Compustat observations only (source_crsp=0) */
    proc sql;
      * Winsorize returns;
      update __common_stocks3 set ret = ret_99_9 where ret > ret_99_9 and source_crsp = 0 and not missing(ret);
      update __common_stocks3 set ret = ret_0_1 where ret < ret_0_1 and source_crsp = 0 and not missing(ret);
      * Winsorize local returns;
      update __common_stocks3 set ret_local = ret_local_99_9 where ret_local > ret_local_99_9 and source_crsp = 0 and not missing(ret_local);
      update __common_stocks3 set ret_local = ret_local_0_1 where ret_local < ret_local_0_1 and source_crsp = 0 and not missing(ret_local);
      * Winsorize excess returns;
      update __common_stocks3 set ret_exc = ret_exc_99_9 where ret_exc > ret_exc_99_9 and source_crsp = 0 and not missing(ret_exc);
      update __common_stocks3 set ret_exc = ret_exc_0_1 where ret_exc < ret_exc_0_1 and source_crsp = 0 and not missing(ret_exc);
      alter table __common_stocks3 drop ret_exc_0_1, ret_exc_99_9, ret_0_1, ret_99_9, ret_local_0_1, ret_local_99_9;
    quit;
  %end;
  %if &wins_comp. = 0 %then %do;
    data __common_stocks3;
      set __common_stocks2;
    run;
  %end;
  proc sql;
    create table mkt1 as
    select excntry, &dt_col., count(*) as stocks,
      sum(me_lag1) as me_lag1, sum(me_cap_lag1) as me_cap_lag1, sum(dolvol_lag1) as dolvol_lag1,
      sum(ret_local*me_lag1)/(calculated me_lag1) as mkt_vw_lcl,
      sum(ret_local*me_cap_lag1)/(calculated me_cap_lag1) as mkt_vw_cap_lcl,
      mean(ret_local) as mkt_ew_lcl,
      sum(ret*me_lag1)/(calculated me_lag1) as mkt_vw,
      sum(ret*me_cap_lag1)/(calculated me_cap_lag1) as mkt_vw_cap,
      mean(ret) as mkt_ew,
      sum(ret_exc*me_lag1)/(calculated me_lag1) as mkt_vw_exc,
      sum(ret_exc*me_cap_lag1)/(calculated me_cap_lag1) as mkt_vw_cap_exc,
      mean(ret_exc) as mkt_ew_exc
    from __common_stocks3
    where obs_main = 1 and exch_main = 1 and primary_sec = 1 and common = 1 and ret_lag_dif <= &max_date_lag. and not missing(me_lag1) and not missing(ret_local)
    group by excntry, &dt_col.;
  quit;
  %if &freq.=m %then %do;
    data &out.;
      set mkt1;
    run;
  %end;
  %if &freq.=d %then %do;
    proc sql;
      create table &out. as
      select * from mkt1
      group by excntry, year(date), month(date)
      having stocks / max(stocks) >= 0.25; /* With less than 25% of stocks trading, it's likely that the date is not an official trading date */
    quit;
  %end;
  proc delete data= __common_stocks1 __common_stocks2 __common_stocks3 mkt1; run;
%mend;

* MACRO: AP_FACTORS
  - This macro creates the factors from the 3-factors model of Fama and French (1993) as well as the factors
    from the 4-factor of Hou, Xue and Zhang (2015). Factors other than market and small minus big, are created
    using an unconditional double sort on sort and the underlying characteristics following the methodology of
    Fama and French (1993). Breakpoints are based on all non-microcap stocks within a country.
  Arguments
  - out: Name of output dataset
  - freq: In (d, m) i.e. either daily or monthly
  - sf: Dataset of stock returns at the chosen frequency
  - mchars: Dataset of characteristics with monthly frequency
  - mkt: dataset with market returns. This is also used to identify trading days.
  - min_stocks_bp: Minimum number of stocks used to create breakpoints.
  - min_stocks_pf: Minimum number of stocks available in the beginning of a month, to create a valid portfolio.
;
%macro ap_factors(out=, freq=, sf=, mchars=, mkt=, min_stocks_bp=, min_stocks_pf=);
  %if &freq.=d %then %do;
    /* Daily return data */
    proc sql;
      create table world_sf1 as
      select excntry, id, date, eom, ret_exc
      from &sf.
where ret_lag_dif <= 5 and not missing(ret_exc); * Impose a maximum lag of 5 days between return calculation; quit; %winsorize_own(inset=world_sf1, outset=world_sf2, sortvar=eom, vars=ret_exc, perc_low=0.1, perc_high=99.9); /* Winsorize Returns at 0.1% (Note I winsorize by month) */ %let __date_col = date; %end; %if &freq.=m %then %do; /* Monthly return data */ proc sql; create table world_sf1 as select excntry, id, eom, ret_exc from &sf. where ret_lag_dif = 1 and not missing(ret_exc); quit; %winsorize_own(inset=world_sf1, outset=world_sf2, sortvar=eom, vars=ret_exc, perc_low=0.1, perc_high=99.9); /* Winsorize Returns at 0.1% */ %let __date_col = eom; %end; /* Sorting Variables */ proc sql; create table base1 as select id, eom, size_grp, excntry, me, market_equity, be_me, at_gr1, niq_be, source_crsp, exch_main, obs_main, common, comp_exchg, crsp_exchcd, primary_sec, ret_lag_dif from &mchars.; quit; proc sort data=base1 out=base2; by id eom; run; %macro temp(); /* Lag variables used at portfolio rebalacing */ %let cols_lag = comp_exchg crsp_exchcd exch_main obs_main common primary_sec excntry size_grp me be_me at_gr1 niq_be; data base3; set base2; by id eom; %do i=1 %to %nwords(&cols_lag.); %let col = %scan(&cols_lag., &i, %str(' ')); &col._l = lag(&col.); if id ^= lag(id) or source_crsp ^= lag(source_crsp) or intck("month", lag(eom), eom)^=1 then &col._l = .; drop &col.; %end; run; %mend; %temp(); /* Screens */ proc sql; create table base4 as select *, case when missing(size_grp_l) then '' when size_grp_l in ('large', 'mega') then 'big' else 'small' end as size_pf from base3 where obs_main_l = 1 and exch_main_l = 1 and common_l = 1 and primary_sec_l = 1 and ret_lag_dif = 1 and not missing(me_l) order by excntry_l, size_grp_l, eom; quit; /* Factors: Three-by-two sort on var and size Fama-French Style*/ %macro sort_ff_style(out=, char=, freq=, min_stocks_bp=, min_stocks_pf=); * Breakpoints (based on NYSE stocks in the US and non-microcap stocks outside of the US); 
proc sql; create table bp_stocks as select * from base4 where ((size_grp_l in ('small', 'large', 'mega') and excntry_l ^= 'USA') or ((crsp_exchcd_l=1 or comp_exchg_l=11) and excntry_l = 'USA')) and not missing(&char._l) order by eom, excntry_l; quit; proc means data=bp_stocks noprint; by eom excntry_l; var &char._l; output out=bps(drop=_type_ _freq_) N=n P30 = bp_p30 P70=bp_p70; run; * Create weights by end of month; proc sql; create table weights1 as select a.excntry_l, a.id, a.eom, a.size_pf, a.me_l, case when a.&char._l >= b.bp_p70 then 'high' when a.&char._l >= b.bp_p30 then 'mid' else 'low' end as char_pf from base4 as a left join bps as b on a.excntry_l = b.excntry_l and a.eom = b.eom where b.n >= &min_stocks_bp. and not missing(a.&char._l) and size_pf^=''; create table weights2 as select *, me_l / sum(me_l) as w from weights1 group by excntry_l, size_pf, char_pf, eom having count(*) >= &min_stocks_pf.; quit; * Match with return data; proc sql; create table returns as select a.*, b.w, b.size_pf, b.char_pf from world_sf2 as a inner join weights2 as b on a.id = b.id and a.eom=b.eom and a.excntry=b.excntry_l; quit; * Create portfolio returns; proc sql; create table pfs1 as select "&char." as characteristic, excntry, size_pf, char_pf, &__date_col., sum(ret_exc*w) as ret_exc from returns group by excntry, size_pf, char_pf, &__date_col.; quit; proc sort data=pfs1 out = pfs2; by characteristic excntry &__date_col.; run; proc transpose data=pfs2 delimiter=_ out=pfs3(drop=_name_); by characteristic excntry &__date_col.; var ret_exc; id size_pf char_pf; run; data &out.; set pfs3; lms = (small_high + big_high) / 2 - (small_low + big_low) / 2; smb = (small_high + small_mid + small_low) / 3 - (big_high + big_mid + big_low) / 3; keep characteristic excntry &__date_col. 
lms smb; run; %mend; /* Create Individual Factors */ %sort_ff_style(out=book_to_market, char=be_me, min_stocks_bp = &min_stocks_bp., min_stocks_pf = &min_stocks_pf.); %sort_ff_style(out=asset_growth, char=at_gr1, min_stocks_bp = &min_stocks_bp., min_stocks_pf = &min_stocks_pf.); %sort_ff_style(out=roeq, char=niq_be, min_stocks_bp = &min_stocks_bp., min_stocks_pf = &min_stocks_pf.); /* Fama and French (1993) */ data ff; set book_to_market; rename lms = hml; rename smb = smb_ff; run; /* Hou, Xue and Zhang (2015) */ data hxz1; set asset_growth roeq; run; proc transpose data=hxz1 out=hxz2; by characteristic excntry &__date_col.; var lms smb; run; proc sort data=hxz2 out = hxz3; by excntry &__date_col.; run; proc transpose data=hxz3 delimiter=_ out=hxz4(drop=_name_); by excntry &__date_col.; var col1; id characteristic _name_; run; data hxz; set hxz4; rename niq_be_lms = roe; smb_hxz = (at_gr1_smb + niq_be_smb) / 2; inv = -at_gr1_lms; run; /* Factor Dataset */ proc sql; create table &out. as select a.excntry, a.&__date_col., a.mkt_vw_exc as mktrf, b.hml, b.smb_ff, c.roe, c.inv, c.smb_hxz from &mkt. as a left join ff as b on a.excntry = b.excntry and a.&__date_col. = b.&__date_col. left join hxz as c on a.excntry = c.excntry and a.&__date_col. = c.&__date_col.; quit; %mend; * MACRO: FILE DELETE; %macro file_delete(file); %let rc= %sysfunc(filename(fref,&file)); %let rc= %sysfunc(fdelete(&fref)); %mend; * MACRO: SAVE_MAIN_DATA_CSV - The macro saves the main data as separate .csv files by country. - By main data we mean data for common stocks that are the primary security of the underlying firm with non-missing return and lagged market equity Arguments - out: Name of the output Zip file (will be saved in &path.) - data: should be the path to the main sas dataset - path: path where data is stored. 
Should be a scratch directory ; %macro save_main_data_csv(out=, data=, path=, end_date=); * Lagged me data; data main_data1; set &data.; me_lag1 = lag(me); if id ^= lag(id) or intck("month", lag(eom), eom)^=1 then me_lag1 = .; run; * Reorder Variables; data main_data2; retain id date eom source_crsp size_grp obs_main exch_main primary_sec gvkey iid permno permco excntry curcd fx common comp_tpci crsp_shrcd comp_exchg crsp_exchcd adjfct shares me me_lag1; set main_data1; run; * Screen; proc sql; create table main_data3 as select * from main_data2 where primary_sec = 1 and common = 1 and obs_main = 1 and exch_main = 1 and eom <= &end_date.; /*removed not missing(me_lag1) and and ret_lag_dif = 1 and not missing(ret_exc)*/ quit; proc sql noprint; select distinct lowcase(excntry) into :countries separated by ' ' from main_data3; quit; /* Create country .csv files */ %do i=1 %to %nwords(&countries.); %put ################ "&path./&__c..csv" ########################; option nonotes; %let __c = %scan(&countries., &i., %str(' ')); proc export data=main_data3(where=(excntry = upcase("&__c."))) outfile="&path./&__c..csv" dbms=CSV replace; run; option notes; %end; * Zip file for easier download; ods package (newzip) open nopf; %do i=1 %to %nwords(&countries.); %let __c = %scan(&countries., &i., %str(' ')); ods package (newzip) add file="&path./&__c..csv"; %end; ods package (newzip) publish archive properties ( archive_name="&out..zip" archive_path= "&path." ); ods package(newzip) close; /* Delete intermidiate .csv files */ %do i=1 %to %nwords(&countries.); %let __c = %scan(&countries., &i., %str(' ')); %file_delete(file=&path./&__c..csv); %end; proc delete data = main_data1 main_data2 main_data3; run; %mend; * MACRO: SAVE_DAILY_RET_CSV - The macro saves the daily return data as a separate .csv file country-by-country. Arguments - out: Name of the output Zip file (will be saved in &path.) - data: should be the path to world_dsf - path: path where data is stored. 
Should be a scratch directory - end_date: restricts the output to till the end_date ; %macro save_daily_ret_csv(out=, data=, path=, end_date=); data daily; set &data.; where date <= &end_date.; keep excntry id date source_crsp me ret ret_exc; run; proc sql noprint; select distinct lowcase(excntry) into :countries separated by ' ' from daily; quit; /* Create country .csv files */ option nonotes; %do i=1 %to %nwords(&countries.); %let __c = %scan(&countries., &i., %str(' ')); %put ################ "&path./&__c..csv" ########################; proc export data=daily(where=(excntry = upcase("&__c."))) outfile="&path./&__c..csv" dbms=CSV replace; run; %end; option notes; * Zip file for easier download; ods package (newzip) open nopf; %do i=1 %to %nwords(&countries.); %let __c = %scan(&countries., &i., %str(' ')); ods package (newzip) add file="&path./&__c..csv"; %end; ods package (newzip) publish archive properties ( archive_name="&out..zip" archive_path= "&path." ); ods package(newzip) close; /* Delete intermidiate .csv files */ %do i=1 %to %nwords(&countries.); %let __c = %scan(&countries., &i., %str(' ')); %file_delete(file=&path./&__c..csv); %end; proc delete data = daily; run; %mend; * MACRO: SAVE_MONTHLY_RET_CSV - The macro saves the monthly return data as a .csv across all countries. Arguments - out: Name of the output Zip file (will be saved in &path.) - data: should be the path to world_msf - path: path where data is stored. 
Should be a scratch directory - end_date: restricts the output to till the end_date ; %macro save_monthly_ret_csv(out=, data=, path=, end_date=); data monthly; set &data.; where eom <= &end_date.; keep excntry id source_crsp eom me ret_exc ret ret_local; run; proc export data=monthly outfile="&path./world_ret_monthly.csv" dbms=CSV replace; run; * Zip file for easier download; ods package (newzip) open nopf; ods package (newzip) add file="&path./world_ret_monthly.csv"; ods package (newzip) publish archive properties ( archive_name="&out..zip" archive_path= "&path." ); ods package(newzip) close; * Delete intermidiate .csv files; %file_delete(file=&path./world_ret_monthly.csv); proc delete data = monthly; run; %mend; /* Footnotes */* [1]: This screen ensures that the return is always computed between two days with a price. The ret_day_dif lets the user check if it's within a reasonable range. Compustat have some observations whre prc and curcdd is missing and div and curcddiv is not missing. In other words dividend ex-date occuring outside trading days should not be used to compute returns. Further, the screen on prcstd ensures that returns are computed with non-stale prices. Specifically prcstd in (3, 4, 10) ensures that the price is taken from observable market data. The main excluded prcstd is 5, which is "No prices available, last actual price was carried forward". This is prevalent in the G_SECD where it accounts for 14.5% of the observations. [2]: Compustat also include a security inactivation date called 'dldtei'. In a subsample of 300 inactive securities, The difference in months between the date I take to be the delisting date and dldtei is 0 for 67% and 1 for 14%. The rest of the observations are scattered from -432 to 317. I think this validates my choice but is is not completely clear. 
; ================================================ FILE: README.md ================================================ ## Overview This repository contains the code used for the paper [Is There a Replication Crisis in Finance?](https://onlinelibrary.wiley.com/doi/10.1111/jofi.13249) by Jensen, Kelly and Pedersen (2023). Please cite this paper if you are using the code or data: ``` @article{JensenKellyPedersen2023, author = {Jensen, Theis Ingerslev and Kelly, Bryan and Pedersen, Lasse Heje}, title = {Is There a Replication Crisis in Finance?}, journal = {The Journal of Finance}, volume = {78}, number = {5}, pages = {2465-2518}, year = {2023} } ``` Follow this [link](https://www.dropbox.com/sh/61j1v0sieq9z210/AACdJ68fs5_eT_eJMunwMBWia?dl=0) for detailed documentation of the data sets. The code consists of the following two self-contained components: - [GlobalFactors](https://github.com/bkelly-lab/ReplicationCrisis/tree/master/GlobalFactors) is a folder with code that creates data sets of global stock returns, firm characteristics, and global long-short factors. __Note that the data can be downloaded without running the code__. The global stock return and firm characteristics can be downloaded from WRDS ([link](https://wrds-www.wharton.upenn.edu/pages/get-data/contributed-data-forms/global-factor-data/)). The long-short factor returns used in the paper can be downloaded [here](https://www.dropbox.com/sh/wcrjok1qyxtrasi/AABZ90GDCUvIzDzijt8Qoo3ha?dl=0), and the latest version of the factor returns is available via the weblink below. In addition, we keep a folder with the latest versions of the factor returns and additional data such as the underlying portfolios, market returns, and industry returns ([link](https://www.dropbox.com/sh/xq278bryrj0qf9s/AABUTvTGok91kakyL07LKyQoa?dl=0)). - [Analysis](https://github.com/bkelly-lab/ReplicationCrisis/tree/master/Analysis) is a folder that contains the analysis in the paper, including all figures and tables.
This folder takes the global factors as input (either the ones that can be downloaded or the ones that you construct yourself). See also the website [https://JKPfactors.com/](https://JKPfactors.com/), where the most recent long-short factors can be downloaded using a simple drop-down menu.