Repository: tabdelaal/scRNAseq_Benchmark Branch: master Commit: 553869b632f4 Files: 82 Total size: 288.4 KB Directory structure: gitextract_ikyozzhh/ ├── Cross_Validation.R ├── DEgenesMAST.R ├── LICENSE ├── README.md ├── Scripts/ │ ├── run_ACTINN.py │ ├── run_CHETAH.R │ ├── run_CaSTLe.R │ ├── run_Cell_BLAST.py │ ├── run_DigitalCellSorter.py │ ├── run_Garnett_CV.R │ ├── run_Garnett_Pretrained.R │ ├── run_LAmbDA.py │ ├── run_LDA.py │ ├── run_LDA_rejection.py │ ├── run_NMC.py │ ├── run_RF.py │ ├── run_SCINA.R │ ├── run_SVM.py │ ├── run_SVM_rejection.py │ ├── run_SingleR.R │ ├── run_kNN50.py │ ├── run_kNN9.py │ ├── run_moana.py │ ├── run_scID.R │ ├── run_scPred.R │ ├── run_scVI.py │ ├── run_scmap.R │ └── run_singleCellNet.R ├── Snakemake/ │ ├── Cross_Validation.R │ ├── DEgenesMAST.R │ ├── Dockerfiles/ │ │ ├── baseline/ │ │ │ └── Dockerfile │ │ ├── cell_blast/ │ │ │ └── Dockerfile │ │ ├── chetah/ │ │ │ ├── Dockerfile │ │ │ └── install_packages.R │ │ ├── cross_validation/ │ │ │ ├── Dockerfile │ │ │ └── install_packages.R │ │ ├── garnett/ │ │ │ ├── Dockerfile │ │ │ └── install_packages.R │ │ ├── scid/ │ │ │ ├── Dockerfile │ │ │ └── install_packages.R │ │ ├── scmap/ │ │ │ ├── Dockerfile │ │ │ └── install_packages.R │ │ ├── scvi/ │ │ │ └── Dockerfile │ │ ├── singlecellnet/ │ │ │ ├── Dockerfile │ │ │ └── install_packages.R │ │ └── singler/ │ │ ├── Dockerfile │ │ └── install_packages.R │ ├── LICENSE │ ├── README.md │ ├── Scripts/ │ │ ├── run_ACTINN.py │ │ ├── run_CHETAH.R │ │ ├── run_CaSTLe.R │ │ ├── run_Cell_BLAST.py │ │ ├── run_DigitalCellSorter.py │ │ ├── run_Garnett_CV.R │ │ ├── run_Garnett_Pretrained.R │ │ ├── run_LAmbDA.py │ │ ├── run_LDA.py │ │ ├── run_LDA_rejection.py │ │ ├── run_NMC.py │ │ ├── run_RF.py │ │ ├── run_SCINA.R │ │ ├── run_SVM.py │ │ ├── run_SVM_rejection.py │ │ ├── run_SingleR.R │ │ ├── run_kNN50.py │ │ ├── run_kNN9.py │ │ ├── run_moana.py │ │ ├── run_scID.R │ │ ├── run_scPred.R │ │ ├── run_scVI.py │ │ ├── run_scmap.R │ │ ├── run_scmapcell.R │ │ ├── run_scmapcluster.R │ │ ├── run_scmaptotal.R │ │ └── run_singleCellNet.R │ ├── Snakefile │ ├── evaluate.R │ ├── example.config.yml │ └── rank_gene_dropouts.py ├── evaluate.R └── rank_gene_dropouts.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: Cross_Validation.R ================================================ Cross_Validation <- function(LabelsPath, col_Index = 1,OutputDir){ " Cross_Validation Function returns train and test indices for 5 folds stratified across unique cell populations, also filter out cell populations with less than 10 cells. It return a 'CV_folds.RData' file which then used as input to classifiers wrappers. Parameters ---------- LabelsPath : Cell population annotations file path (.csv). col_Index : column index (integer) defining which level of annotation to use, in case of multiple cell type annotations (default is 1) OutputDir : Output directory defining the path of the exported file. " Labels <- as.matrix(read.csv(LabelsPath)) Labels <- as.vector(Labels[,col_Index]) Removed_classes <- !(table(Labels) > 10) Cells_to_Keep <- !(is.element(Labels,names(Removed_classes)[Removed_classes])) Labels <- Labels[Cells_to_Keep] # Getting training and testing Folds library(rBayesianOptimization) n_folds = 5 Folds <- KFold(Labels,nfolds = n_folds, stratified = TRUE) Test_Folds <- c(n_folds:1) Train_Idx <- list() Test_Idx <- list() for (i in c(1:length(Folds))){ Temp_Folds <- Folds Temp_Folds[Test_Folds[i]] <- NULL Train_Idx[i] <- list(unlist(Temp_Folds)) Test_Idx[i] <- Folds[Test_Folds[i]] } remove(Temp_Folds,i,Folds) setwd(OutputDir) save(n_folds,Train_Idx,Test_Idx,col_Index,Cells_to_Keep,file = 'CV_folds.RData') } ================================================ FILE: DEgenesMAST.R ================================================ DEgenesMAST <- function(Data, Labels, Normalize = FALSE, LogTransform = FALSE){ # This functions applies a differential expression test to the data using one vs all # The training data should be used a an input # The output is a matrix with marker genes where the columns are the cell populations and the rows are the top20 marker genes # This output can be rewritten to the format of the prior-knowledge-supervised classifiers and afterwards be used to classify the test set. # Data: genes X cells (rows = genes, columns = cells) # Labels: labels of the data # Normalize: the input for MAST should be cpm normalized data, # if the data is not normalized yet, this should be set to TRUE # LogTransform: the input for MAST should be logtransformed, # if the data is not logtransformed yet, this should be set to TRUE library(Seurat) if(Normalize) { Data <- apply(Data, 2, function(x) (x/sum(x))*1000000) } if(LogTransform) { Data <- log(Data+1, base = 2) } SeuObj <- CreateSeuratObject(raw.data = Data, project = "DEgenes") SeuObj <- SetIdent(SeuObj, ident.use = Labels) DEgenes <- FindAllMarkers(SeuObj, test.use = "MAST") Markers <- matrix(nrow = 20,ncol = length(unique(Labels))) colnames(Markers) <- unique(Labels) for (i in unique(Labels)){ i TempList <- DEgenes$gene[((DEgenes$cluster == i) & (DEgenes$avg_logFC > 0))] MarkerGenes <- DEgenes$p_val_adj[DEgenes$cluster == i] print(MarkerGenes[1:20]) if (length(TempList) >= 20){ Markers[,i] <- TempList[1:20] } else{ if(length(TempList) > 0){ Markers[c(1:length(TempList)),i] <- TempList } } } return(Markers) } ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2019 tabdelaal Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================ # A comparison of automatic cell identification methods for single-cell RNA-sequencing data We present a comprehensive evaluation of the performance of state-of-the-art classification methods, in addition to general-purpose classifiers, for automatic cell identification single cell RNA-sequencing datasets. Our goal is to provide the community with a fair evaluation of all available methods to facilitate the users’ choice as well as direct further developments to focus on the challenging aspects of automated cell type identification. (published in genome biology Sep. 2019 https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1795-z) ### Repository description We provide all the scripts to run and evaluate all classifiers, and to reproduce the results introduced in the paper. 1. 'Scripts' folder contains a wrapper function to read the data and apply certain classification method. 2. ```Cross_Validation``` R script can be used to produce training and test indices for cross validation. 3. ```rank_gene_dropouts``` Python script can be used to apply feature selection using the dropout method, and rank genes accordingly. 4. ```evaluate``` R script can be used to evaluate the prediction of a certain classifier and obtain scores such as accuracy, median F1-score and % unlabeld cells. For more details, please check function documentations. ### General Usage To benchmark and fairly evaluate the performance of different classifiers using benchmark-datasets (Filtered datasets can be downloaded from https://zenodo.org/record/3357167), apply the following steps: #### Step 1 Apply the ```Cross_Validation``` R function on the corresponding dataset to obtain fixed training and test cell indices, straitified across different cell types. For example, using the Tabula Muris (TM) dataset ```R Cross_Validation('~/TM/Labels.csv', 1, '~/TM/') ``` This command will create a ```CV_folds.RData``` file used as input in Step 2. #### Step 2 Run each classifier wrapper. For example, running scPred on TM dataset ```R run_scPred('~/TM/Filtered_TM_data.csv','~/TM/Labels.csv','~/TM/CV_folds.RData','~/Results/TM/') ``` This command will output the true and predicted cell labels as csv files, as well as the classifier computation time. #### Step 3 Evaluate the classifier prediction by ```R result <- evaluate('~/Results/TM/scPred_True_Labels.csv', '~/Results/TM/scPred_Pred_Labels.csv') ``` This command will return the corresponding accuracy, median F1-score, F1-scores for all cell populations, % unlabeled cells, and confusion matrix. ### Usage with feature selection #### Step 1 Apply the ```Cross_Validation``` R function on the corresponding dataset to obtain fixed training and test cell indices, straitified across different cell types. For example, using the Tabula Muris (TM) dataset ```R Cross_Validation('~/TM/Labels.csv', 1, '~/TM/') ``` This command will create a ```CV_folds.RData``` file used as input in Step 2 and 3. #### Step 2 Apply the ```rank_gene_dropouts``` Python script to get the genes ranking for each training fold using the dropout criteria ``` rank_gene_dropouts('~/TM/Filtered_TM_data.csv', '~/TM/CV_folds.RData', '~/TM/') ``` This command will create a ```rank_genes_dropouts.csv``` file used as input in Step 3. #### Step 3 Run each classifier wrapper. For example, running scPred on TM dataset with 1000 genes ```R run_scPred('~/TM/Filtered_TM_data.csv','~/TM/Labels.csv','~/TM/CV_folds.RData','~/Results/TM/', GeneOrderPath = '~/TM/rank_genes_dropouts.csv',NumGenes = 1000) ``` This command will output the true and predicted cell labels as csv files, as well as the classifier computation time. #### Step 4 Evaluate the classifier prediction by ```R result <- evaluate('~/Results/TM/scPred_True_Labels.csv', '~/Results/TM/scPred_Pred_Labels.csv') ``` This command will return the corresponding accuracy, median F1-score, F1-scores for all cell populations, % unlabeled cells, and confusion matrix. ### Evaluate Marker-based methods using DE genes To evaluate the marker-based methods SCINA, DigitalCellSorter and Garnett using DE genes learned from the data, you may follow these steps: #### Step 1 Apply the ```Cross_Validation``` R function on the corresponding dataset to obtain fixed training and test cell indices, straitified across different cell types. For example, using the Zheng_sorted dataset ```R Cross_Validation('~/TM/Labels.csv', 1, '~/Zheng_sorted/') ``` This command will create a ```CV_folds.RData``` file used as input in Step 2 and 3. #### Step 2 For each fold use the training data to get the DE genes using the ```DEgenesMAST``` R function, and pass these DE genes to the corresponding method, for example here we use SCINA, to obtain cell prediction for the test data. ```R load('CV_folds.RData') Data <- read.csv('~/Zheng_sorted/Filtered_DownSampled_SortedPBMC_data',row.names = 1) Labels <- as.matrix(read.csv('~/Zheng_sorted/Labels.csv')) Labels <- as.vector(Labels[,col_Index]) Data <- Data[Cells_to_Keep,] Labels <- Labels[Cells_to_Keep] for (i in c(1:n_folds)) { MarkerGenes <- DEgenesMAST(t(Data[Train_Idx[[i]],]), Labels[Train_Idx[[i]]], Normalize = TRUE, LogTransform = TRUE) ## write the MarkerGenes into a marker genes file format, depending on the tested method, for example for SCINA write.csv(MarkerGenes, 'MarkerGenes.csv') ## run the SCINA wrapper using these DE marker genes run_SCINA(Data[Test_Idx[[i]],], Labels[Test_Idx[[i]]], 'MarkerGenes.csv', '~/Results/Zheng_sorted/') } ``` ### Snakemake To support future extension of this benchmarking work with new classifiers and datasets, we provide a Snakemake workflow to automate the performed benchmarking analyses (https://github.com/tabdelaal/scRNAseq_Benchmark/tree/snakemake_and_docker). ================================================ FILE: Scripts/run_ACTINN.py ================================================ import os import numpy as np import pandas as pd import time as tm import rpy2.robjects as robjects def run_ACTINN(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0): ''' run ACTINN Wrapper script to run ACTINN on a benchmark dataset with 5-fold cross validation, outputs lists of true and predicted cell labels as csv files, as well as computation time. Parameters ---------- DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes as row names and gene names as column names. LabelsPath : Cell population annotations file path (.csv). CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function. OutputDir : Output directory defining the path of the exported file. GeneOrderPath : Gene order file path (.csv) obtained from feature selection, defining the genes order for each cross validation fold, default is NULL. NumGenes : Number of genes used in case of feature selection (integer), default is 0. ''' # read the Rdata file robjects.r['load'](CV_RDataPath) nfolds = np.array(robjects.r['n_folds'], dtype = 'int') tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool') col = np.array(robjects.r['col_Index'], dtype = 'int') col = col - 1 test_ind = np.array(robjects.r['Test_Idx']) train_ind = np.array(robjects.r['Train_Idx']) # read the data data = pd.read_csv(DataPath,index_col=0,sep=',') labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col) labels = labels.iloc[tokeep] data = data.iloc[tokeep] # read the feature file if (NumGenes > 0): features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',') # folder with results os.chdir(OutputDir) tot=[] truelab = [] pred = [] for i in range(np.squeeze(nfolds)): test_ind_i = np.array(test_ind[i], dtype = 'int') - 1 train_ind_i = np.array(train_ind[i], dtype = 'int') - 1 train=data.iloc[train_ind_i] test=data.iloc[test_ind_i] y_train=labels.iloc[train_ind_i] y_test=labels.iloc[test_ind_i] if (NumGenes > 0): feat_to_use = features.iloc[0:NumGenes,i] train = train.iloc[:,feat_to_use] test = test.iloc[:,feat_to_use] train = train.transpose() test = test.transpose() train.to_csv("train.csv") test.to_csv("test.csv") y_train.to_csv("train_lab.csv", header = False, index = True, sep = '\t') y_test.to_csv("test_lab.csv", header = False, index = True, sep = '\t') tm.sleep(60) os.system("python /home/nfs/lcmmichielsen/classifiers/ACTINN/actinn_format.py -i train.csv -o train -f csv") os.system("python /home/nfs/lcmmichielsen/classifiers/ACTINN/actinn_format.py -i test.csv -o test -f csv") start = tm.time() os.system("python /home/nfs/lcmmichielsen/classifiers/ACTINN/actinn_predict.py -trs train.h5 -trl train_lab.csv -ts test.h5") tot.append(tm.time()-start) tm.sleep(60) truelab.extend(y_test.values) predlabels = pd.read_csv('predicted_label.txt',header=0,index_col=None, sep='\t', usecols = [1]) pred.extend(predlabels.values) truelab = pd.DataFrame(truelab) pred = pd.DataFrame(pred) tot_time = pd.DataFrame(tot) if (NumGenes == 0): truelab.to_csv("ACTINN_True_Labels.csv", index = False) pred.to_csv("ACTINN_Pred_Labels.csv", index = False) tot_time.to_csv("ACTINN_Total_Time.csv", index = False) else: truelab.to_csv("ACTINN_" + str(NumGenes) + "_True_Labels.csv", index = False) pred.to_csv("ACTINN_" + str(NumGenes) + "_Pred_Labels.csv", index = False) tot_time.to_csv("ACTINN_" + str(NumGenes) + "_Total_Time.csv", index = False) ================================================ FILE: Scripts/run_CHETAH.R ================================================ run_CHETAH<-function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){ " run CHETAH Wrapper script to run CHETAH on a benchmark dataset with 5-fold cross validation, outputs lists of true and predicted cell labels as csv files, as well as computation time. Parameters ---------- DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes as row names and gene names as column names. LabelsPath : Cell population annotations file path (.csv). CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function. OutputDir : Output directory defining the path of the exported file. GeneOrderPath : Gene order file path (.csv) obtained from feature selection, defining the genes order for each cross validation fold, default is NULL. NumGenes : Number of genes used in case of feature selection (integer), default is NULL. " Data <- read.csv(DataPath,row.names = 1) Labels <- as.matrix(read.csv(LabelsPath)) load(CV_RDataPath) Labels <- as.vector(Labels[,col_Index]) Data <- Data[Cells_to_Keep,] Labels <- Labels[Cells_to_Keep] if(!is.null(GeneOrderPath) & !is.null (NumGenes)){ GenesOrder = read.csv(GeneOrderPath) } ############################################################################# # CHETAH # ############################################################################# library(CHETAH) library(SingleCellExperiment) True_Labels_CHETAH <- list() Pred_Labels_CHETAH <- list() Total_Time_CHETAH <- list() Data = t(as.matrix(Data)) for (i in c(1:n_folds)){ if(!is.null(GeneOrderPath) & !is.null (NumGenes)){ sce <- SingleCellExperiment(assays = list(counts = Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Train_Idx[[i]]]), colData = data.frame(celltypes = Labels[Train_Idx[[i]]])) sce_test <- SingleCellExperiment(assays = list(counts = Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Test_Idx[[i]]]), colData = data.frame(celltypes = Labels[Test_Idx[[i]]])) start_time <- Sys.time() sce_test <- CHETAHclassifier(input = sce_test, ref_cells = sce, n_genes = NumGenes) end_time <- Sys.time() } else{ sce <- SingleCellExperiment(assays = list(counts = Data[,Train_Idx[[i]]]), colData = data.frame(celltypes = Labels[Train_Idx[[i]]])) sce_test <- SingleCellExperiment(assays = list(counts = Data[,Test_Idx[[i]]]), colData = data.frame(celltypes = Labels[Test_Idx[[i]]])) start_time <- Sys.time() sce_test <- CHETAHclassifier(input = sce_test, ref_cells = sce) end_time <- Sys.time() } Total_Time_CHETAH[i] <- as.numeric(difftime(end_time,start_time,units = 'secs')) True_Labels_CHETAH[i] <- list(Labels[Test_Idx[[i]]]) Pred_Labels_CHETAH[i] <- list(sce_test$celltype_CHETAH) } True_Labels_CHETAH <- as.vector(unlist(True_Labels_CHETAH)) Pred_Labels_CHETAH <- as.vector(unlist(Pred_Labels_CHETAH)) Total_Time_CHETAH <- as.vector(unlist(Total_Time_CHETAH)) setwd(OutputDir) if (!is.null(GeneOrderPath) & !is.null (NumGenes)){ write.csv(True_Labels_CHETAH,paste('CHETAH_',NumGenes,'_True_Labels.csv', sep = ''),row.names = FALSE) write.csv(Pred_Labels_CHETAH,paste('CHETAH_',NumGenes,'_Pred_Labels.csv', sep = ''),row.names = FALSE) write.csv(Total_Time_CHETAH,paste('CHETAH_',NumGenes,'_Total_Time.csv', sep = ''),row.names = FALSE) } else{ write.csv(True_Labels_CHETAH,'CHETAH_True_Labels.csv',row.names = FALSE) write.csv(Pred_Labels_CHETAH,'CHETAH_Pred_Labels.csv',row.names = FALSE) write.csv(Total_Time_CHETAH,'CHETAH_Total_Time.csv',row.names = FALSE) } } ================================================ FILE: Scripts/run_CaSTLe.R ================================================ run_CaSTLe<-function(DataPath,LabelsPath,CV_RDataPath, OutputDir, GeneOrderPath = NULL, NumGenes = NULL){ " run CaSTLe Wrapper script to run CaSTLe on a benchmark dataset with 5-fold cross validation, outputs lists of true and predicted cell labels as csv files, as well as computation time. Parameters ---------- DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes as row names and gene names as column names. LabelsPath : Cell population annotations file path (.csv). CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function. OutputDir : Output directory defining the path of the exported file. GeneOrderPath : Gene order file path (.csv) obtained from feature selection, defining the genes order for each cross validation fold, default is NULL. NumGenes : Number of genes used in case of feature selection (integer), default is NULL. " Data <- read.csv(DataPath,row.names = 1) Labels <- as.matrix(read.csv(LabelsPath)) load(CV_RDataPath) Labels <- as.vector(Labels[,col_Index]) Data <- Data[Cells_to_Keep,] Labels <- Labels[Cells_to_Keep] if(!is.null(GeneOrderPath) & !is.null (NumGenes)){ GenesOrder = read.csv(GeneOrderPath) } ############################################################################# # CaSTLe # ############################################################################# library(igraph) library(xgboost) True_Labels_Castle <- list() Pred_Labels_Castle <- list() Training_Time_Castle <- list() Testing_Time_Castle <- list() BREAKS=c(-1, 0, 1, 6, Inf) nFeatures = 100 for(i in c(1:n_folds)){ # 1. Load datasets if(!is.null(GeneOrderPath) & !is.null (NumGenes)){ ds1 = Data[Train_Idx[[i]],as.vector(GenesOrder[c(1:NumGenes),i])+1] ds2 = Data[Test_Idx[[i]],as.vector(GenesOrder[c(1:NumGenes),i])+1] } else{ ds1 = Data[Train_Idx[[i]],] ds2 = Data[Test_Idx[[i]],] } sourceCellTypes = as.factor(Labels[Train_Idx[[i]]]) targetCellTypes = as.factor(Labels[Test_Idx[[i]]]) start_time <- Sys.time() # 2. Unify sets, excluding low expressed genes source_n_cells_counts = apply(ds1, 2, function(x) { sum(x > 0) } ) target_n_cells_counts = apply(ds2, 2, function(x) { sum(x > 0) } ) common_genes = intersect( colnames(ds1)[source_n_cells_counts>10], colnames(ds2)[target_n_cells_counts>10]) remove(source_n_cells_counts, target_n_cells_counts) ds1 = ds1[, colnames(ds1) %in% common_genes] ds2 = ds2[, colnames(ds2) %in% common_genes] ds = rbind(ds1[,common_genes], ds2[,common_genes]) isSource = c(rep(TRUE,nrow(ds1)), rep(FALSE,nrow(ds2))) remove(ds1, ds2) # 3. Highest mean in both source and target topFeaturesAvg = colnames(ds)[order(apply(ds, 2, mean), decreasing = T)] end_time <- Sys.time() Training_Time_Castle[i] <- as.numeric(difftime(end_time,start_time,units = 'secs')) start_time <- Sys.time() # for each cell - what is the most probable classification? L = length(levels(sourceCellTypes)) targetClassification = as.data.frame(matrix(rep(0,L*sum(!isSource)), nrow=L), row.names = levels(sourceCellTypes)) for (cellType in levels(sourceCellTypes)) { inSourceCellType = as.factor(ifelse(sourceCellTypes == cellType, cellType, paste0("NOT",cellType))) # 4. Highest mutual information in source topFeaturesMi = names(sort(apply(ds[isSource,],2,function(x) { compare(cut(x,breaks=BREAKS),inSourceCellType,method = "nmi") }), decreasing = T)) # 5. Top n genes that appear in both mi and avg selectedFeatures = union(head(topFeaturesAvg, nFeatures) , head(topFeaturesMi, nFeatures) ) # 6. remove correlated features tmp = cor(ds[,selectedFeatures], method = "pearson") tmp[!lower.tri(tmp)] = 0 selectedFeatures = selectedFeatures[apply(tmp,2,function(x) any(x < 0.9))] remove(tmp) # 7,8. Convert data from continous to binned dummy vars # break datasets to bins dsBins = apply(ds[, selectedFeatures], 2, cut, breaks= BREAKS) # use only bins with more than one value nUniq = apply(dsBins, 2, function(x) { length(unique(x)) }) # convert to dummy vars ds0 = model.matrix(~ . , as.data.frame(dsBins[,nUniq>1])) remove(dsBins, nUniq) cat(paste0("

Classifier for ",cellType,"

")) inTypeSource = sourceCellTypes == cellType # 9. Classify xg=xgboost(data=ds0[isSource,] , label=inTypeSource, objective="binary:logistic", eta=0.7 , nthread=1, nround=20, verbose=0, gamma=0.001, max_depth=5, min_child_weight=10) # 10. Predict inTypeProb = predict(xg, ds0[!isSource, ]) targetClassification[cellType,] = inTypeProb } end_time <- Sys.time() Testing_Time_Castle[i] <- as.numeric(difftime(end_time,start_time,units = 'secs')) True_Labels_Castle[i] <- list(Labels[Test_Idx[[i]]]) Pred_Labels_Castle[i] <- list(rownames(targetClassification)[apply(targetClassification,2,which.max)]) } True_Labels_Castle <- as.vector(unlist(True_Labels_Castle)) Pred_Labels_Castle <- as.vector(unlist(Pred_Labels_Castle)) Training_Time_Castle <- as.vector(unlist(Training_Time_Castle)) Testing_Time_Castle <- as.vector(unlist(Testing_Time_Castle)) if(!is.null(GeneOrderPath) & !is.null (NumGenes)){ write.csv(True_Labels_Castle,paste('True_Labels_Castle_',NumGenes,'.csv', sep = ''),row.names = FALSE) write.csv(Pred_Labels_Castle,paste('Pred_Labels_Castle_',NumGenes,'.csv', sep = ''),row.names = FALSE) write.csv(Training_Time_Castle,paste('Training_Time_Castle_',NumGenes,'.csv', sep = ''),row.names = FALSE) write.csv(Testing_Time_Castle,paste('Testing_Time_Castle_',NumGenes,'.csv', sep = ''),row.names = FALSE) } else{ write.csv(True_Labels_Castle,'True_Labels_CaSTLe.csv',row.names = FALSE) write.csv(Pred_Labels_Castle,'Pred_Labels_CaSTLe.csv',row.names = FALSE) write.csv(Training_Time_Castle,'Training_Time_CaSTLe.csv',row.names = FALSE) write.csv(Testing_Time_Castle,'Testing_Time_CaSTLe.csv',row.names = FALSE) } } ================================================ FILE: Scripts/run_Cell_BLAST.py ================================================ import os import time as tm import pandas as pd import warnings warnings.filterwarnings("ignore") import tensorflow as tf tf.logging.set_verbosity(0) import Cell_BLAST as cb import numpy as np from numpy import genfromtxt as gft import rpy2.robjects as robjects def run_Cell_BLAST(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0): ''' run Cell_BLAST Wrapper script to run Cell_BLAST on a benchmark dataset with 5-fold cross validation, outputs lists of true and predicted cell labels as csv files, as well as computation time. Parameters ---------- DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes as row names and gene names as column names. LabelsPath : Cell population annotations file path (.csv). CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function. OutputDir : Output directory defining the path of the exported file. GeneOrderPath : Gene order file path (.csv) obtained from feature selection, defining the genes order for each cross validation fold, default is NULL. NumGenes : Number of genes used in case of feature selection (integer), default is 0. ''' # read the Rdata file robjects.r['load'](CV_RDataPath) nfolds = np.array(robjects.r['n_folds'], dtype = 'int') tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool') col = np.array(robjects.r['col_Index'], dtype = 'int') col = col - 1 test_ind = np.array(robjects.r['Test_Idx']) train_ind = np.array(robjects.r['Train_Idx']) # read the feature file if (NumGenes > 0): features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',') # read the data and labels data_old = cb.data.ExprDataSet.read_table(DataPath,orientation="cg", sep=",", index_col = 0, header = 0, sparsify = True).normalize() labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col) data = cb.data.ExprDataSet(data_old.exprs[tokeep],data_old.obs.iloc[tokeep],data_old.var,data_old.uns) labels = gft(LabelsPath, dtype = "str", skip_header = 1, delimiter = ",", usecols = col) labels = labels[tokeep] os.chdir(OutputDir) truelab = [] pred = [] tr_time = [] ts_time = [] for i in range(np.squeeze(nfolds)): test_ind_i = np.array(test_ind[i], dtype = 'int') - 1 train_ind_i = np.array(train_ind[i], dtype = 'int') - 1 train=data[train_ind_i,:] test=data[test_ind_i,:] y_train = labels[train_ind_i] y_test = labels[test_ind_i] if (NumGenes > 0): feat_to_use = features.iloc[0:NumGenes,i] train = train[:,feat_to_use] test = test[:,feat_to_use] train.obs['cell_type'] = y_train start = tm.time() # reduce dimensions num_epoch = 50 models = [] for j in range(4): models.append(cb.directi.fit_DIRECTi(train, epoch=num_epoch, patience=10, random_seed = j, path="%d" % j)) # train model blast = cb.blast.BLAST(models, train).build_empirical() tr_time.append(tm.time()-start) # predict labels start = tm.time() test_pred = blast.query(test).annotate('cell_type') ts_time.append(tm.time()-start) truelab.extend(y_test) pred.extend(test_pred.values) #write results truelab = pd.DataFrame(truelab) pred = pd.DataFrame(pred) tr_time = pd.DataFrame(tr_time) ts_time = pd.DataFrame(ts_time) if (NumGenes == 0): truelab.to_csv("Cell_BLAST_True_Labels.csv", index = False) pred.to_csv("Cell_BLAST_Pred_Labels.csv", index = False) tr_time.to_csv("Cell_BLAST_Training_Time.csv", index = False) ts_time.to_csv("Cell_BLAST_Testing_Time.csv", index = False) else: truelab.to_csv("Cell_BLAST_" + str(NumGenes) + "_True_Labels.csv", index = False) pred.to_csv("Cell_BLAST_" + str(NumGenes) + "_Pred_Labels.csv", index = False) tr_time.to_csv("Cell_BLAST_" + str(NumGenes) + "_Training_Time.csv", index = False) ts_time.to_csv("Cell_BLAST_" + str(NumGenes) + "_Testing_Time.csv", index = False) ================================================ FILE: Scripts/run_DigitalCellSorter.py ================================================ import numpy as np import pandas as pd import scripts.DigitalCellSorter as DigitalCellSorter import os import time as tm import rpy2.robjects as robjects def run_DigitalCellSorter(DataPath, LabelsPath, CV_RDataPath, GeneListPath, OutputDir, GeneOrderPath = "", NumGenes = 0): ''' run DigitalCellSorter Wrapper script to run DigitalCellSorter on a benchmark dataset using a predefined genelist, outputs lists of true and predicted cell labels as csv files, as well as computation time. Parameters ---------- DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes as row names and gene names as column names. LabelsPath : Cell population annotations file path (.csv). CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function. GeneListPath : Data file path to the genest. OutputDir : Output directory defining the path of the exported file. GeneOrderPath : Gene order file path (.csv) obtained from feature selection, defining the genes order for each cross validation fold, default is NULL. NumGenes : Number of genes used in case of feature selection (integer), default is 0. ''' # read the Rdata file robjects.r['load'](CV_RDataPath) tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool') col = np.array(robjects.r['col_Index'], dtype = 'int') col = col - 1 # read the data data = pd.read_csv(DataPath,index_col=0,sep=',') data = data.iloc[tokeep] truelab = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col) truelab = truelab.iloc[tokeep] # read the feature file if (NumGenes > 0): features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',') feat_to_use = features.iloc[0:NumGenes,0] data = data.iloc[:,feat_to_use] data = data.transpose() # number of different cell types in the data? n_clusters = 8 AvailableCPUsCount = 1 N_samples_for_distribution = 10000 start = tm.time() pred = DigitalCellSorter.DigitalCellSorter().Process(data, 'DigitalCellSorter_Zhang', saveDir = OutputDir, geneListFileName = GeneListPath, N_samples_for_distribution = N_samples_for_distribution, AvailableCPUsCount = AvailableCPUsCount, clusterIndex=None, clusterName=None, n_clusters=n_clusters) runtime = tm.time() - start os.chdir(OutputDir) results = pd.read_excel('DigitalCellSorter_Zhang_voting.xlsx',header=0,index_col=None, usecols=[11]) prediction = np.zeros(np.shape(pred), dtype='>U10') for i in range(len(results)): prediction[np.where(pred == i)] = results.values[i] prediction = pd.DataFrame(prediction) if (NumGenes == 0): truelab.to_csv("DigitalCellSorter_True_Labels.csv", index = False) prediction.to_csv("DigitalCellSorter_Pred_Labels.csv", index = False) with open("DigitalCellSorter_Total_Time.csv", 'w') as f: f.write("%f\n" % runtime) else: truelab.to_csv("DigitalCellSorter_" + str(NumGenes) + "_True_Labels.csv", index = False) prediction.to_csv("DigitalCellSorter_" + str(NumGenes) + "_Pred_Labels.csv", index = False) with open("DigitalCellSorter_" + str(NumGenes) + "_Total_Time.csv", 'w') as f: f.write("%f\n" % runtime) ================================================ FILE: Scripts/run_Garnett_CV.R ================================================ run_Garnett_CV <- function(DataPath, LabelsPath, CV_RDataPath, GenesPath, MarkerPath, OutputDir, Human){ " run Garnett Wrapper script to run Garnett on a benchmark dataset with 5-fold cross validation, outputs lists of true and predicted cell labels as csv files, as well as computation time. Parameters ---------- DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes as row names and gene names as column names. LabelsPath : Cell population annotations file path (.csv). CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function. GenesPath : Path to the file with the genenames MarkerPath : Path to the file with marker genes OutputDir : Output directory defining the path of the exported file. Human : boolean indicating whether the dataset is human (TRUE) or mouse (FALSE) " # load needed libraries library(garnett) if (Human) { library(org.Hs.eg.db) } else { library(org.Mm.eg.db) } # load the CVFile load(CV_RDataPath) # read the labels labels <- as.matrix(read.csv(LabelsPath)) labels <- as.vector(labels[,col_Index]) labels <- labels[Cells_to_Keep] # read the data mat <- read.table(DataPath, sep = ",") data <- mat[-1,-1] data <- data[Cells_to_Keep,] data <- t(data) #ensure that the genes are rows, and the cells are columns cells <- mat[-1,1] cells <- cells[Cells_to_Keep] # read the genefile fdata <- read.table(GenesPath) names(fdata) <- 'gene_short_name' row.names(fdata) <- fdata$gene_short_name fd <- new("AnnotatedDataFrame", data = fdata) true_labels <- list() pred_labels <- list() train_time <- list() test_time <- list() for (i in c(1:n_folds)){ lab_train = labels[Train_Idx[[i]]] lab_test = labels[Test_Idx[[i]]] train = data[,Train_Idx[[i]]] test = data[,Test_Idx[[i]]] cells_train = cells[Train_Idx[[i]]] cells_test = cells[Test_Idx[[i]]] pdata_train = data.frame(cells_train) pdata_test = data.frame(cells_test) row.names(train) <- row.names(fdata) row.names(test) <- row.names(fdata) colnames(train) <- row.names(pdata_train) colnames(test) <- row.names(pdata_test) pd_train <- new("AnnotatedDataFrame", data = pdata_train) pd_test <- new("AnnotatedDataFrame", data = pdata_test) pbmc_cds_train <- newCellDataSet(as(train, "dgCMatrix"), phenoData = pd_train, featureData = fd) pbmc_cds_test <- newCellDataSet(as(test, "dgCMatrix"), phenoData = pd_test, featureData = fd) pbmc_cds_train <- estimateSizeFactors(pbmc_cds_train) pbmc_cds_test <- estimateSizeFactors(pbmc_cds_test) # training start_train <- Sys.time() if (Human){ pbmc_classifier <- train_cell_classifier(cds = pbmc_cds_train, marker_file = MarkerPath, db=org.Hs.eg.db, cds_gene_id_type = "SYMBOL", num_unknown = 50, marker_file_gene_id_type = "SYMBOL") } else { pbmc_classifier <- train_cell_classifier(cds = pbmc_cds_train, marker_file = MarkerPath, db=org.Mm.eg.db, cds_gene_id_type = "SYMBOL", num_unknown = 50, marker_file_gene_id_type = "SYMBOL") } end_train <- Sys.time() train_time[i] <- as.numeric(end_train - start_train) # testing start_test <- Sys.time() if (Human) { pbmc_cds_test <- classify_cells(pbmc_cds_test, pbmc_classifier, db = org.Hs.eg.db, cluster_extend = TRUE, cds_gene_id_type = "SYMBOL") } else { pbmc_cds_test <- classify_cells(pbmc_cds_test, pbmc_classifier, db = org.Mm.eg.db, cluster_extend = TRUE, cds_gene_id_type = "SYMBOL") } end_test <- Sys.time() test_time[i] <- as.numeric(end_test - start_test) true_labels[i] <- list(lab_test) pred_labels[i] <- list(pData(pbmc_cds_test)$cluster_ext_type) } true_labels <- as.vector(unlist(true_labels)) pred_labels <- as.vector(unlist(pred_labels)) train_time <- as.vector(unlist(train_time)) test_time <- as.vector(unlist(test_time)) setwd(OutputDir) write.csv(train_time,'Garnett_CV_Testing_Time.csv',row.names = FALSE) write.csv(test_time,'Garnett_CV_Training_Time.csv',row.names = FALSE) write.csv(true_labels, 'Garnett_CV_True_Labels.csv', row.names = FALSE) write.csv(pred_labels, 'Garnett_CV_Pred_Labels.csv', row.names = FALSE) } ================================================ FILE: Scripts/run_Garnett_Pretrained.R ================================================ run_Garnett_Pretrained <- function(DataPath, LabelsPath, GenesPath, CV_RDataPath, ClassifierPath, OutputDir, Human){ " run Garnett Wrapper script to run Garnett on a benchmark dataset with a pretrained classifier, outputs lists of true and predicted cell labels as csv files, as well as computation time. Parameters ---------- DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes as row names and gene names as column names. LabelsPath : Cell population annotations file path (.csv). CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function. GenesPath : Path to the file with the genenames ClassifierPath : Path to the pretrained classifier OutputDir : Output directory defining the path of the exported file. Human : boolean indicating whether the dataset is human (TRUE) or mouse (FALSE) " # load needed libraries library(garnett) if (Human) { library(org.Hs.eg.db) } else { library(org.Mm.eg.db) } # load data, genes, and marker file load(CV_RDataPath) load(ClassifierPath) labels <- as.matrix(read.csv(LabelsPath)) labels <- labels[Cells_to_Keep] mat <- read.table(DataPath, sep = ",") data <- mat[-1,-1] data <- data[Cells_to_Keep,] data <- t(data) #ensure that the genes are rows, and the cells are columns barcodes <- mat[-1,1] pdata = data.frame(barcodes) fdata <- read.table(GenesPath) names(fdata) <- 'gene_short_name' row.names(fdata) <- fdata$gene_short_name row.names(data) <- row.names(fdata) colnames(data) <- row.names(pdata) pd <- new("AnnotatedDataFrame", data = pdata) fd <- new("AnnotatedDataFrame", data = fdata) pbmc_cds <- newCellDataSet(as(data, "dgCMatrix"), phenoData = pd, featureData = fd) start_time <- Sys.time() pbmc_cds <- estimateSizeFactors(pbmc_cds) if (Human){ pbmc_cds <- classify_cells(pbmc_cds, hsPBMC, db = org.Hs.eg.db, cluster_extend = TRUE, cds_gene_id_type = "SYMBOL") } else { pbmc_cds <- classify_cells(pbmc_cds, mmLung, db = org.Mm.eg.db, cluster_extend = TRUE, cds_gene_id_type = "SYMBOL") } end_time <- Sys.time() test_time <- as.numeric(end_time - start_time) setwd(OutputDir) write.table(pData(pbmc_cds)$cluster_ext_type, file = "Garnett_Pred_Labels.csv", append = FALSE, quote = TRUE, sep = "\t", eol = "\n", na = "NA", dec = ".", row.names = FALSE, qmethod = c("escape", "double"), fileEncoding = "") write.csv(labels,"Garnett_Pretrained_True_Labels.csv", row.names = FALSE) write.csv(test_time,'Garnett_Pretrained_Testing_Time.csv',row.names = FALSE) } ================================================ FILE: Scripts/run_LAmbDA.py ================================================ # -*- coding: utf-8 -*- """ Created on Thu May 23 13:51:15 2019 @author: Lieke """ import os import numpy as np import pandas as pd import time as tm import rpy2.robjects as robjects import tensorflow as tf import math import scipy.io as sio import optunity as opt from tensorflow.contrib.tensor_forest.python import tensor_forest from tensorflow.python.ops import resources def run_LAmbDA(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0): ''' run LAmbDA classifier Wrapper script to run LAmbDA on a benchmark dataset with 5-fold cross validation, outputs lists of true and predicted cell labels as csv files, as well as computation time. Parameters ---------- DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes as row names and gene names as column names. LabelsPath : Cell population annotations file path (.csv). CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function. OutputDir : Output directory defining the path of the exported file. GeneOrderPath : Gene order file path (.csv) obtained from feature selection, defining the genes order for each cross validation fold, default is NULL. NumGenes : Number of genes used in case of feature selection (integer), default is 0. ''' # read the Rdata file robjects.r['load'](CV_RDataPath) nfolds = np.array(robjects.r['n_folds'], dtype = 'int') tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool') col = np.array(robjects.r['col_Index'], dtype = 'int') col = col - 1 test_ind = np.array(robjects.r['Test_Idx']) train_ind = np.array(robjects.r['Train_Idx']) # read the data data = pd.read_csv(DataPath,index_col=0,sep=',') labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col) labels = labels.iloc[tokeep] data = data.iloc[tokeep] # read the feature file if (NumGenes > 0): features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',') # folder with results os.chdir(OutputDir) tr_time=[] ts_time=[] truelab = np.zeros([len(labels),1],dtype = int) predlab = np.zeros([len(labels),1],dtype = int) for i in range(np.squeeze(nfolds)): global X, Y, Gnp, Dnp, train, test, prt, cv test_ind_i = np.array(test_ind[i], dtype = 'int') - 1 train_ind_i = np.array(train_ind[i], dtype = 'int') - 1 X = np.array(data) if (NumGenes > 0): X = np.log2(X/10+1) feat_to_use = features.iloc[0:NumGenes,i] X = X[:,feat_to_use] else: X = np.log2(np.transpose(select_feats(np.transpose(X),0.5,80))/10+1) uniq = np.unique(labels) Y = np.zeros([len(labels),len(uniq)],int) for j in range(len(uniq)): Y[np.where(labels == uniq[j])[0],j] = 1 Y = np.array(Y) Gnp = np.zeros([len(uniq),len(uniq)],int) np.fill_diagonal(Gnp,1) Gnp = np.array(Gnp) Dnp = np.ones([len(uniq),1],int) Dnp = np.array(Dnp) train_samp = int(np.floor(0.75*len(train_ind_i))) test_samp = len(train_ind_i) - train_samp perm = np.random.permutation(len(train_ind_i)) train = perm[0:train_samp] test = perm[train_samp:test_samp+1] while(np.sum(np.sum(Y[train,:],0)<5)>0): perm = np.random.permutation(X.shape[0]) train = perm[0:train_samp+1] test = perm[train_samp+1:train_samp+test_samp+1] cv = i optunity_it = 0 prt = False opt_params = None start=tm.time() opt_params, _, _ = opt.minimize(run_LAmbDA2,solver_name='sobol', gamma=[0.8,1.2], delta=[0.05,0.95], tau=[10.0,11.0], prc_cut=[20,50], bs_prc=[0.2,0.6], num_trees=[10,200], max_nodes=[100,1000], num_evals=50) tr_time.append(tm.time()-start) print("Finished training!") prt = True train = train_ind_i test = test_ind_i start=tm.time() err = run_LAmbDA2(opt_params['gamma'], opt_params['delta'], opt_params['tau'], opt_params['prc_cut'], opt_params['bs_prc'], opt_params['num_trees'], opt_params['max_nodes']) ts_time.append(tm.time()-start) tf.reset_default_graph(); predfile = 'preds_cv' + str(cv) + '.mat' truefile = 'truth_cv' + str(cv) + '.mat' pred = sio.loadmat(predfile) truth = sio.loadmat(truefile) pred = pred['preds'] truth = truth['labels'] pred_ind = np.argmax(pred,axis=1) truth_ind = np.argmax(truth,axis=1) predlab[test_ind_i,0] = pred_ind truelab[test_ind_i,0] = truth_ind truelab = pd.DataFrame(truelab) predlab = pd.DataFrame(predlab) tr_time = pd.DataFrame(tr_time) ts_time = pd.DataFrame(ts_time) if (NumGenes == 0): truelab.to_csv("LAmbDA_True_Labels.csv", index = False) predlab.to_csv("LAmbDA_Pred_Labels.csv", index = False) tr_time.to_csv("LAmbDA_Training_Time.csv", index = False) ts_time.to_csv("LAmbDA_Testing_Time.csv", index = False) else: truelab.to_csv("LAmbDA_" + str(NumGenes) + "_True_Labels.csv", index = False) predlab.to_csv("LAmbDA_" + str(NumGenes) + "_Pred_Labels.csv", index = False) tr_time.to_csv("LAmbDA_" + str(NumGenes) + "_Training_Time.csv", index = False) ts_time.to_csv("LAmbDA_" + str(NumGenes) + "_Testing_Time.csv", index = False) ##### Functions copied from LAmbDA's Github def wt_cutoff(colnum,cutoff,Gtmp,gamma): rowsums = np.sum(Gtmp,axis=1); return(math.ceil(cutoff*(math.log((max(rowsums)/rowsums[colnum])+1,2)**gamma))) def resample(prc_cut,Y,Gtmp,train,gamma): add = list() rem = list() colsums = np.sum(Y[train,:],axis=0); cutoff = math.ceil(np.percentile(colsums,prc_cut)); for i in range(len(colsums)): if colsums[i] == 0: pass elif colsums[i] < wt_cutoff(i,cutoff,Gtmp,gamma): idx = np.squeeze(np.array(np.where(Y[train,i]>=1))); choice = np.random.choice(train[idx],int(wt_cutoff(i,cutoff,Gtmp,gamma)-colsums[i])) add = add + choice.tolist(); elif colsums[i] == wt_cutoff(i,cutoff,Gtmp,gamma): pass else: idx = np.squeeze(np.array(np.where(Y[train,i]>=1))); choice = np.random.choice(train[idx],int(colsums[i]-wt_cutoff(i,cutoff,Gtmp,gamma)),replace=False) rem = rem + choice.tolist() return np.concatenate((list([val for val in train if val not in rem]),add)); def select_feats(Xtmp,num_zero_prc_cut,var_prc_cut): #********************************************************************* # remove features with many zeros num_feat_zeros = np.sum(Xtmp==0,axis=1); Xtmp = Xtmp[num_feat_zerosnp.percentile(feat_vars,var_prc_cut),:] return(Xtmp) def get_yn(predict,ys,delta,tau,output_feats): D = tf.cast(Dnp, tf.float32); G = tf.cast(Gnp, tf.float32); ys = tf.cast(ys, tf.float32); #print("start") Cm = tf.matmul(tf.transpose(tf.matmul(ys,D)),predict+0.1)/tf.reshape(tf.reduce_sum(tf.transpose(tf.matmul(ys,D)),1),(-1,1)); #print("1") mCm = tf.reshape(tf.reduce_mean(tf.cast(tf.matmul(tf.transpose(D),G)>0,tf.float32)*Cm,1),(-1,1)); #print("2") yw = tf.multiply(predict+0.1,tf.matmul(tf.matmul(ys,D),tf.pow(mCm/Cm,tau))); #print("3") ye = tf.multiply(tf.matmul(ys,G),yw); #print("4") yt = tf.matmul(ys,tf.matmul(tf.transpose(ys),ye)); #print("5") ya = (delta*yt)+((1-delta)*ye) #print("6") yn = tf.cast(tf.one_hot(tf.argmax(ya,axis=1),output_feats), dtype=tf.float32) #print("7") return(yn) def get_yi(rowsums,G2,ys): G2 = tf.cast(G2, tf.float32); ys = tf.cast(ys, tf.float32); yi = tf.cast(tf.matmul(ys,G2), dtype=tf.float32); return(yi) def run_LAmbDA2(gamma, delta, tau, prc_cut, bs_prc, num_trees, max_nodes): global X, Y, Gnp, Dnp, train, test, prt, cv D = tf.cast(Dnp, tf.float32); G = tf.cast(Gnp, tf.float32); #optunity_it = optunity_it+1; num_trees = int(num_trees); max_nodes = int(max_nodes); prc_cut = int(np.ceil(prc_cut)); print("gamma=%.4f, delta=%.4f, tau=%.4f, prc_cut=%i, bs_prc=%.4f, num_trees=%i, max_nodes=%i" % (gamma, delta, tau, prc_cut, bs_prc, num_trees, max_nodes)) input_feats = X.shape[1]; num_labls = G.shape.as_list(); output_feats = num_labls[1]; #print(output_feats) num_labls = num_labls[0]; rowsums = np.sum(Gnp,axis=1); train2 = resample(prc_cut, Y, Gnp, train, gamma); # Bug?? bs = int(np.ceil(bs_prc*train2.size)) xs = tf.placeholder(tf.float32, [None,input_feats]) #ys = tf.placeholder(tf.float32, [None,num_labls]) yin = tf.placeholder(tf.int32, [None]) print("Vars loaded xs and ys created") hparams = tensor_forest.ForestHParams(num_classes=output_feats, num_features=input_feats, num_trees=num_trees, max_nodes=max_nodes).fill() print("Tensor forest hparams created") forest_graph = tensor_forest.RandomForestGraphs(hparams) print("Tensor forest graph created") train_op = forest_graph.training_graph(xs, yin) loss_op = forest_graph.training_loss(xs, yin) print("Loss and train ops created") predict, _, _ = forest_graph.inference_graph(xs) print("Tensor forest variables created through predict") accuracy_op = tf.reduce_mean(tf.reduce_sum(tf.square(tf.one_hot(yin,output_feats)-predict),reduction_indices=[1])) print(tf.reduce_sum(tf.square(tf.one_hot(yin,output_feats)-predict),reduction_indices=[1])) #predict = tf.one_hot(pred); print("Lambda specific variables created") # Creating training and testing steps G2 = np.copy(Gnp); G2[rowsums>1,:] = 0; YI = np.matmul(Y,G2); YIrs = np.sum(YI,axis=1); trainI = train2[np.in1d(train2,np.where(YIrs==1))]; print("data type trainI,",trainI.dtype) testI = test[np.in1d(test,np.where(YIrs==1))]; print("trainI testI created") #init_vars=tf.global_variables_initializer() init_vars = tf.group(tf.global_variables_initializer(), resources.initialize_resources(resources.shared_resources())) sess = tf.Session() sess.run(init_vars) print("Session started") #beep = sess.run(predict,feed_dict={xs:X[1:100,:]}); #beep = sess.run(predict,feed_dict={xs:X[train2[0:bs],:]}); tensor_trainI = {xs: X[trainI, :], yin: sess.run(tf.argmax(get_yi(rowsums,G2,Y[trainI, :]),axis=1))} print("tensor_trainI made") tensor_testI = {xs: X[testI, :], yin: sess.run(tf.argmax(get_yi(rowsums,G2,Y[testI, :]),axis=1))} print("tensor_testI made") tensor_train = {xs: X[train2[0:bs], :], yin: sess.run(tf.argmax(get_yn(sess.run(predict,feed_dict={xs:X[train2[0:bs],:]}),Y[train2[0:bs], :],delta,tau,output_feats),axis=1))} print("tensor_train made") tensor_test = {xs: X[test, :], yin: sess.run(tf.argmax(get_yn(sess.run(predict,feed_dict={xs:X[test,:]}),Y[test, :],delta,tau,output_feats),axis=1))} print("tensor_test made") #********************************** #print("Loss and training steps created with sample tensors") # Setting params and initializing print("Beginning iterations") # Starting training iterations print(X.shape) for i in range(1,101): if i < 50: sess.run(train_op, feed_dict=tensor_trainI) #print("ran train op") if i % 10 == 0: print(str(sess.run(accuracy_op, feed_dict=tensor_trainI)) + ' ' + str(sess.run(accuracy_op, feed_dict=tensor_testI))) else: sess.run(train_op, feed_dict=tensor_train) if i % 10 == 0: print(str(sess.run(accuracy_op, feed_dict=tensor_train)) + ' ' + str(sess.run(accuracy_op, feed_dict=tensor_test))) elif i % 10 == 0: np.random_shuffle(train2); tensor_train = {xs: X[train2[0:bs], :], yin: sess.run(get_yn(sess.run(predict,feed_dict={xs:X[train2[0:bs],:]}),Y[train2[0:bs], :],delta,tau,output_feats))} if prt: blah = sess.run(predict, feed_dict=tensor_test); sio.savemat('preds_cv' + str(cv) + '.mat', {'preds': blah}); sio.savemat('truth_cv' + str(cv) + '.mat', {'labels': Y[test, :]}); acc = sess.run(accuracy_op, feed_dict=tensor_test) print("loss1=%.4f, gamma=%.4f, delta=%.4f, tau=%.4f, prc_cut=%i, bs_prc=%.4f, num_trees=%i, max_nodes=%i" % (acc, gamma, delta, tau, prc_cut, bs_prc, num_trees, max_nodes)) tf.reset_default_graph(); return(acc) ================================================ FILE: Scripts/run_LDA.py ================================================ import os import numpy as np import pandas as pd import time as tm from sklearn.discriminant_analysis import LinearDiscriminantAnalysis import rpy2.robjects as robjects def run_LDA(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0): ''' run baseline classifier: LDA Wrapper script to run LDA classifier on a benchmark dataset with 5-fold cross validation, outputs lists of true and predicted cell labels as csv files, as well as computation time. Parameters ---------- DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes as row names and gene names as column names. LabelsPath : Cell population annotations file path (.csv). CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function. OutputDir : Output directory defining the path of the exported file. GeneOrderPath : Gene order file path (.csv) obtained from feature selection, defining the genes order for each cross validation fold, default is NULL. NumGenes : Number of genes used in case of feature selection (integer), default is 0. ''' # read the Rdata file robjects.r['load'](CV_RDataPath) nfolds = np.array(robjects.r['n_folds'], dtype = 'int') tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool') col = np.array(robjects.r['col_Index'], dtype = 'int') col = col - 1 test_ind = np.array(robjects.r['Test_Idx']) train_ind = np.array(robjects.r['Train_Idx']) # read the data data = pd.read_csv(DataPath,index_col=0,sep=',') labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col) labels = labels.iloc[tokeep] data = data.iloc[tokeep] # read the feature file if (NumGenes > 0): features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',') # folder with results os.chdir(OutputDir) # normalize data data = np.log1p(data) Classifier = LinearDiscriminantAnalysis() tr_time=[] ts_time=[] truelab = [] pred = [] for i in range(np.squeeze(nfolds)): test_ind_i = np.array(test_ind[i], dtype = 'int') - 1 train_ind_i = np.array(train_ind[i], dtype = 'int') - 1 train=data.iloc[train_ind_i] test=data.iloc[test_ind_i] y_train=labels.iloc[train_ind_i] y_test=labels.iloc[test_ind_i] if (NumGenes > 0): feat_to_use = features.iloc[0:NumGenes,i] train = train.iloc[:,feat_to_use] test = test.iloc[:,feat_to_use] start=tm.time() Classifier.fit(train, y_train) tr_time.append(tm.time()-start) start=tm.time() predicted = Classifier.predict(test) ts_time.append(tm.time()-start) truelab.extend(y_test.values) pred.extend(predicted) truelab = pd.DataFrame(truelab) pred = pd.DataFrame(pred) tr_time = pd.DataFrame(tr_time) ts_time = pd.DataFrame(ts_time) if (NumGenes == 0): truelab.to_csv("LDA_True_Labels.csv", index = False) pred.to_csv("LDA_Pred_Labels.csv", index = False) tr_time.to_csv("LDA_Training_Time.csv", index = False) ts_time.to_csv("LDA_Testing_Time.csv", index = False) else: truelab.to_csv("LDA_" + str(NumGenes) + "_True_Labels.csv", index = False) pred.to_csv("LDA_" + str(NumGenes) + "_Pred_Labels.csv", index = False) tr_time.to_csv("LDA_" + str(NumGenes) + "_Training_Time.csv", index = False) ts_time.to_csv("LDA_" + str(NumGenes) + "_Testing_Time.csv", index = False) ================================================ FILE: Scripts/run_LDA_rejection.py ================================================ import os import numpy as np import pandas as pd import time as tm from sklearn.discriminant_analysis import LinearDiscriminantAnalysis import rpy2.robjects as robjects def run_LDA_rejection(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0, Threshold = 0.7): ''' run baseline classifier: LDA Wrapper script to run LDA classifier on a benchmark dataset with 5-fold cross validation, outputs lists of true and predicted cell labels as csv files, as well as computation time. Parameters ---------- DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes as row names and gene names as column names. LabelsPath : Cell population annotations file path (.csv). CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function. OutputDir : Output directory defining the path of the exported file. GeneOrderPath : Gene order file path (.csv) obtained from feature selection, defining the genes order for each cross validation fold, default is NULL. NumGenes : Number of genes used in case of feature selection (integer), default is 0. Threshold : Threshold used when rejecting the genes, default is 0.7. ''' # read the Rdata file robjects.r['load'](CV_RDataPath) nfolds = np.array(robjects.r['n_folds'], dtype = 'int') tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool') col = np.array(robjects.r['col_Index'], dtype = 'int') col = col - 1 test_ind = np.array(robjects.r['Test_Idx']) train_ind = np.array(robjects.r['Train_Idx']) # read the data data = pd.read_csv(DataPath,index_col=0,sep=',') labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col) labels = labels.iloc[tokeep] data = data.iloc[tokeep] # read the feature file if (NumGenes > 0): features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',') # folder with results os.chdir(OutputDir) # normalize data data = np.log1p(data) Classifier = LinearDiscriminantAnalysis() tr_time=[] ts_time=[] truelab = [] pred = [] for i in range(np.squeeze(nfolds)): test_ind_i = np.array(test_ind[i], dtype = 'int') - 1 train_ind_i = np.array(train_ind[i], dtype = 'int') - 1 train=data.iloc[train_ind_i] test=data.iloc[test_ind_i] y_train=labels.iloc[train_ind_i] y_test=labels.iloc[test_ind_i] if (NumGenes > 0): feat_to_use = features.iloc[0:NumGenes,i] train = train.iloc[:,feat_to_use] test = test.iloc[:,feat_to_use] start=tm.time() Classifier.fit(train, y_train) tr_time.append(tm.time()-start) start=tm.time() predicted = Classifier.predict(test) prob = np.max(Classifier.predict_proba(test), axis = 1) unlabeled = np.where(prob < Threshold) predicted[unlabeled] = 'Unknown' ts_time.append(tm.time()-start) truelab.extend(y_test.values) pred.extend(predicted) truelab = pd.DataFrame(truelab) pred = pd.DataFrame(pred) tr_time = pd.DataFrame(tr_time) ts_time = pd.DataFrame(ts_time) if (NumGenes == 0): truelab.to_csv("LDA_True_Labels.csv", index = False) pred.to_csv("LDA_Pred_Labels.csv", index = False) tr_time.to_csv("LDA_Training_Time.csv", index = False) ts_time.to_csv("LDA_Testing_Time.csv", index = False) else: truelab.to_csv("LDA_" + str(NumGenes) + "_True_Labels.csv", index = False) pred.to_csv("LDA_" + str(NumGenes) + "_Pred_Labels.csv", index = False) tr_time.to_csv("LDA_" + str(NumGenes) + "_Training_Time.csv", index = False) ts_time.to_csv("LDA_" + str(NumGenes) + "_Testing_Time.csv", index = False) ================================================ FILE: Scripts/run_NMC.py ================================================ import os import numpy as np import pandas as pd import time as tm from sklearn.neighbors import NearestCentroid import rpy2.robjects as robjects def run_NMC(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0): ''' run baseline classifier: NMC Wrapper script to run a NMC classifier on a benchmark dataset with 5-fold cross validation, outputs lists of true and predicted cell labels as csv files, as well as computation time. Parameters ---------- DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes as row names and gene names as column names. LabelsPath : Cell population annotations file path (.csv). CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function. OutputDir : Output directory defining the path of the exported file. GeneOrderPath : Gene order file path (.csv) obtained from feature selection, defining the genes order for each cross validation fold, default is NULL. NumGenes : Number of genes used in case of feature selection (integer), default is 0. ''' # read the Rdata file robjects.r['load'](CV_RDataPath) nfolds = np.array(robjects.r['n_folds'], dtype = 'int') tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool') col = np.array(robjects.r['col_Index'], dtype = 'int') col = col - 1 test_ind = np.array(robjects.r['Test_Idx']) train_ind = np.array(robjects.r['Train_Idx']) # read the data data = pd.read_csv(DataPath,index_col=0,sep=',') labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col) labels = labels.iloc[tokeep] data = data.iloc[tokeep] # read the feature file if (NumGenes > 0): features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',') # folder with results os.chdir(OutputDir) # normalize data data = np.log1p(data) Classifier = NearestCentroid() tr_time=[] ts_time=[] truelab = [] pred = [] for i in range(np.squeeze(nfolds)): test_ind_i = np.array(test_ind[i], dtype = 'int') - 1 train_ind_i = np.array(train_ind[i], dtype = 'int') - 1 train=data.iloc[train_ind_i] test=data.iloc[test_ind_i] y_train=labels.iloc[train_ind_i] y_test=labels.iloc[test_ind_i] if (NumGenes > 0): feat_to_use = features.iloc[0:NumGenes,i] train = train.iloc[:,feat_to_use] test = test.iloc[:,feat_to_use] start=tm.time() Classifier.fit(train, y_train) tr_time.append(tm.time()-start) start=tm.time() predicted = Classifier.predict(test) ts_time.append(tm.time()-start) truelab.extend(y_test.values) pred.extend(predicted) truelab = pd.DataFrame(truelab) pred = pd.DataFrame(pred) tr_time = pd.DataFrame(tr_time) ts_time = pd.DataFrame(ts_time) if (NumGenes == 0): truelab.to_csv("NMC_True_Labels.csv", index = False) pred.to_csv("NMC_Pred_Labels.csv", index = False) tr_time.to_csv("NMC_Training_Time.csv", index = False) ts_time.to_csv("NMC_Testing_Time.csv", index = False) else: truelab.to_csv("NMC_" + str(NumGenes) + "_True_Labels.csv", index = False) pred.to_csv("NMC_" + str(NumGenes) + "_Pred_Labels.csv", index = False) tr_time.to_csv("NMC_" + str(NumGenes) + "_Training_Time.csv", index = False) ts_time.to_csv("NMC_" + str(NumGenes) + "_Testing_Time.csv", index = False) ================================================ FILE: Scripts/run_RF.py ================================================ import os import numpy as np import pandas as pd import time as tm from sklearn.ensemble import RandomForestClassifier import rpy2.robjects as robjects def run_RF(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0): ''' run baseline classifier: RF Wrapper script to run a RF classifier with 50 trees on a benchmark dataset with 5-fold cross validation, outputs lists of true and predicted cell labels as csv files, as well as computation time. Parameters ---------- DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes as row names and gene names as column names. LabelsPath : Cell population annotations file path (.csv). CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function. OutputDir : Output directory defining the path of the exported file. GeneOrderPath : Gene order file path (.csv) obtained from feature selection, defining the genes order for each cross validation fold, default is NULL. NumGenes : Number of genes used in case of feature selection (integer), default is 0. ''' # read the Rdata file robjects.r['load'](CV_RDataPath) nfolds = np.array(robjects.r['n_folds'], dtype = 'int') tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool') col = np.array(robjects.r['col_Index'], dtype = 'int') col = col - 1 test_ind = np.array(robjects.r['Test_Idx']) train_ind = np.array(robjects.r['Train_Idx']) # read the data data = pd.read_csv(DataPath,index_col=0,sep=',') labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col) labels = labels.iloc[tokeep] data = data.iloc[tokeep] # read the feature file if (NumGenes > 0): features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',') # folder with results os.chdir(OutputDir) # normalize data data = np.log1p(data) Classifier = RandomForestClassifier(n_estimators = 50) tr_time=[] ts_time=[] truelab = [] pred = [] for i in range(np.squeeze(nfolds)): test_ind_i = np.array(test_ind[i], dtype = 'int') - 1 train_ind_i = np.array(train_ind[i], dtype = 'int') - 1 train=data.iloc[train_ind_i] test=data.iloc[test_ind_i] y_train=labels.iloc[train_ind_i] y_test=labels.iloc[test_ind_i] if (NumGenes > 0): feat_to_use = features.iloc[0:NumGenes,i] train = train.iloc[:,feat_to_use] test = test.iloc[:,feat_to_use] start=tm.time() Classifier.fit(train, y_train) tr_time.append(tm.time()-start) start=tm.time() predicted = Classifier.predict(test) ts_time.append(tm.time()-start) truelab.extend(y_test.values) pred.extend(predicted) truelab = pd.DataFrame(truelab) pred = pd.DataFrame(pred) tr_time = pd.DataFrame(tr_time) ts_time = pd.DataFrame(ts_time) if (NumGenes == 0): truelab.to_csv("RF_True_Labels.csv", index = False) pred.to_csv("RF_Pred_Labels.csv", index = False) tr_time.to_csv("RF_Training_Time.csv", index = False) ts_time.to_csv("RF_Testing_Time.csv", index = False) else: truelab.to_csv("RF_" + str(NumGenes) + "_True_Labels.csv", index = False) pred.to_csv("RF_" + str(NumGenes) + "_Pred_Labels.csv", index = False) tr_time.to_csv("RF_" + str(NumGenes) + "_Training_Time.csv", index = False) ts_time.to_csv("RF_" + str(NumGenes) + "_Testing_Time.csv", index = False) ================================================ FILE: Scripts/run_SCINA.R ================================================ run_SCINA<-function(DataPath,LabelsPath,GeneSigPath,OutputDir){ " run SCINA Wrapper script to run SCINA on a benchmark dataset, outputs lists of true and predicted cell labels as csv files, as well as computation time. Parameters ---------- DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes as row names and gene names as column names. LabelsPath : Cell population annotations file path (.csv). GeneSigPath : Cell type marker genes file path (.csv) OutputDir : Output directory defining the path of the exported file. " Data <- read.csv(DataPath,row.names = 1) Labels <- as.vector(as.matrix(read.csv(LabelsPath))) Data <- Data[is.element(Labels,c('CD14+ Monocyte','CD19+ B','CD56+ NK')),] Labels <- Labels[is.element(Labels,c('CD14+ Monocyte','CD19+ B','CD56+ NK'))] Labels[Labels == 'CD14+ Monocyte'] <- 'CD14_Monocyte' Labels[Labels == 'CD19+ B'] <- 'CD19_B' Labels[Labels == 'CD56+ NK'] <- 'CD56_NK' ############################################################################# # SCINA # ############################################################################# library(SCINA) Signature_Genes <- preprocess.signatures(GeneSigPath) True_Labels_SCINA <- list() Pred_Labels_SCINA <- list() Total_Time_SCINA <- list() library(preprocessCore) Data = t(as.matrix(Data)) Data=log(Data+1) Data[]=normalize.quantiles(Data) start_time <- Sys.time() results = SCINA(Data, Signature_Genes) end_time <- Sys.time() True_Labels_SCINA <- Labels Pred_Labels_SCINA <- results$cell_labels Total_Time_SCINA <- as.numeric(difftime(end_time,start_time,units = 'secs')) setwd(OutputDir) write.csv(True_Labels_SCINA,'SCINA_True_Labels.csv',row.names = FALSE) write.csv(Pred_Labels_SCINA,'SCINA_Pred_Labels.csv',row.names = FALSE) write.csv(Total_Time_SCINA,'SCINA_Total_Time.csv',row.names = FALSE) } ================================================ FILE: Scripts/run_SVM.py ================================================ import os import numpy as np import pandas as pd import time as tm from sklearn.svm import LinearSVC import rpy2.robjects as robjects def run_SVM(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0): ''' run baseline classifier: SVM Wrapper script to run an SVM classifier with a linear kernel on a benchmark dataset with 5-fold cross validation, outputs lists of true and predicted cell labels as csv files, as well as computation time. Parameters ---------- DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes as row names and gene names as column names. LabelsPath : Cell population annotations file path (.csv). CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function. OutputDir : Output directory defining the path of the exported file. GeneOrderPath : Gene order file path (.csv) obtained from feature selection, defining the genes order for each cross validation fold, default is NULL. NumGenes : Number of genes used in case of feature selection (integer), default is 0. ''' # read the Rdata file robjects.r['load'](CV_RDataPath) nfolds = np.array(robjects.r['n_folds'], dtype = 'int') tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool') col = np.array(robjects.r['col_Index'], dtype = 'int') col = col - 1 test_ind = np.array(robjects.r['Test_Idx']) train_ind = np.array(robjects.r['Train_Idx']) # read the data data = pd.read_csv(DataPath,index_col=0,sep=',') labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col) labels = labels.iloc[tokeep] data = data.iloc[tokeep] # read the feature file if (NumGenes > 0): features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',') # folder with results os.chdir(OutputDir) # normalize data data = np.log1p(data) Classifier = LinearSVC() tr_time=[] ts_time=[] truelab = [] pred = [] for i in range(np.squeeze(nfolds)): test_ind_i = np.array(test_ind[i], dtype = 'int') - 1 train_ind_i = np.array(train_ind[i], dtype = 'int') - 1 train=data.iloc[train_ind_i] test=data.iloc[test_ind_i] y_train=labels.iloc[train_ind_i] y_test=labels.iloc[test_ind_i] if (NumGenes > 0): feat_to_use = features.iloc[0:NumGenes,i] train = train.iloc[:,feat_to_use] test = test.iloc[:,feat_to_use] start=tm.time() Classifier.fit(train, y_train) tr_time.append(tm.time()-start) start=tm.time() predicted = Classifier.predict(test) ts_time.append(tm.time()-start) truelab.extend(y_test.values) pred.extend(predicted) truelab = pd.DataFrame(truelab) pred = pd.DataFrame(pred) tr_time = pd.DataFrame(tr_time) ts_time = pd.DataFrame(ts_time) if (NumGenes == 0): truelab.to_csv("SVM_True_Labels.csv", index = False) pred.to_csv("SVM_Pred_Labels.csv", index = False) tr_time.to_csv("SVM_Training_Time.csv", index = False) ts_time.to_csv("SVM_Testing_Time.csv", index = False) else: truelab.to_csv("SVM_" + str(NumGenes) + "_True_Labels.csv", index = False) pred.to_csv("SVM_" + str(NumGenes) + "_Pred_Labels.csv", index = False) tr_time.to_csv("SVM_" + str(NumGenes) + "_Training_Time.csv", index = False) ts_time.to_csv("SVM_" + str(NumGenes) + "_Testing_Time.csv", index = False) ================================================ FILE: Scripts/run_SVM_rejection.py ================================================ import os import numpy as np import pandas as pd import time as tm from sklearn.svm import LinearSVC import rpy2.robjects as robjects from sklearn.calibration import CalibratedClassifierCV def run_SVM(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0, Threshold = 0.7): ''' run baseline classifier: SVM Wrapper script to run an SVM classifier with a linear kernel on a benchmark dataset with 5-fold cross validation, outputs lists of true and predicted cell labels as csv files, as well as computation time. Parameters ---------- DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes as row names and gene names as column names. LabelsPath : Cell population annotations file path (.csv). CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function. OutputDir : Output directory defining the path of the exported file. GeneOrderPath : Gene order file path (.csv) obtained from feature selection, defining the genes order for each cross validation fold, default is NULL. NumGenes : Number of genes used in case of feature selection (integer), default is 0. Threshold : Threshold used when rejecting the cells, default is 0.7. ''' # read the Rdata file robjects.r['load'](CV_RDataPath) nfolds = np.array(robjects.r['n_folds'], dtype = 'int') tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool') col = np.array(robjects.r['col_Index'], dtype = 'int') col = col - 1 test_ind = np.array(robjects.r['Test_Idx']) train_ind = np.array(robjects.r['Train_Idx']) # read the data data = pd.read_csv(DataPath,index_col=0,sep=',') labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col) labels = labels.iloc[tokeep] data = data.iloc[tokeep] # read the feature file if (NumGenes > 0): features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',') # folder with results os.chdir(OutputDir) # normalize data data = np.log1p(data) Classifier = LinearSVC() clf = CalibratedClassifierCV(Classifier) tr_time=[] ts_time=[] truelab = [] pred = [] for i in range(np.squeeze(nfolds)): test_ind_i = np.array(test_ind[i], dtype = 'int') - 1 train_ind_i = np.array(train_ind[i], dtype = 'int') - 1 train=data.iloc[train_ind_i] test=data.iloc[test_ind_i] y_train=labels.iloc[train_ind_i] y_test=labels.iloc[test_ind_i] if (NumGenes > 0): feat_to_use = features.iloc[0:NumGenes,i] train = train.iloc[:,feat_to_use] test = test.iloc[:,feat_to_use] start=tm.time() clf.fit(train, y_train) tr_time.append(tm.time()-start) start=tm.time() predicted = clf.predict(test) prob = np.max(clf.predict_proba(test), axis = 1) unlabeled = np.where(prob < Threshold) predicted[unlabeled] = 'Unknown' ts_time.append(tm.time()-start) truelab.extend(y_test.values) pred.extend(predicted) truelab = pd.DataFrame(truelab) pred = pd.DataFrame(pred) tr_time = pd.DataFrame(tr_time) ts_time = pd.DataFrame(ts_time) if (NumGenes == 0): truelab.to_csv("SVM_True_Labels.csv", index = False) pred.to_csv("SVM_Pred_Labels.csv", index = False) tr_time.to_csv("SVM_Training_Time.csv", index = False) ts_time.to_csv("SVM_Testing_Time.csv", index = False) else: truelab.to_csv("SVM_" + str(NumGenes) + "_True_Labels.csv", index = False) pred.to_csv("SVM_" + str(NumGenes) + "_Pred_Labels.csv", index = False) tr_time.to_csv("SVM_" + str(NumGenes) + "_Training_Time.csv", index = False) ts_time.to_csv("SVM_" + str(NumGenes) + "_Testing_Time.csv", index = False) ================================================ FILE: Scripts/run_SingleR.R ================================================ run_SingleR<-function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){ " run SingleR Wrapper script to run SingleR on a benchmark dataset with 5-fold cross validation, outputs lists of true and predicted cell labels as csv files, as well as computation time. Parameters ---------- DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes as row names and gene names as column names. LabelsPath : Cell population annotations file path (.csv). CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function. OutputDir : Output directory defining the path of the exported file. GeneOrderPath : Gene order file path (.csv) obtained from feature selection, defining the genes order for each cross validation fold, default is NULL. NumGenes : Number of genes used in case of feature selection (integer), default is NULL. " Data <- read.csv(DataPath,row.names = 1) Labels <- as.matrix(read.csv(LabelsPath)) load(CV_RDataPath) Labels <- as.vector(Labels[,col_Index]) Data <- Data[Cells_to_Keep,] Labels <- Labels[Cells_to_Keep] if(!is.null(GeneOrderPath) & !is.null (NumGenes)){ GenesOrder = read.csv(GeneOrderPath) } ############################################################################# # SingleR # ############################################################################# library(SingleR) library(Seurat) True_Labels_SingleR <- list() Pred_Labels_SingleR <- list() Total_Time_SingleR <- list() Data = t(as.matrix(Data)) for (i in c(1:n_folds)){ if(!is.null(GeneOrderPath) & !is.null (NumGenes)){ start_time <- Sys.time() singler = SingleR(method = "single", Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Test_Idx[[i]]], Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Train_Idx[[i]]], Labels[Train_Idx[[i]]], numCores = 1) end_time <- Sys.time() } else{ start_time <- Sys.time() singler = SingleR(method = "single", Data[,Test_Idx[[i]]], Data[,Train_Idx[[i]]], Labels[Train_Idx[[i]]], numCores = 1) end_time <- Sys.time() } Total_Time_SingleR[i] <- as.numeric(difftime(end_time,start_time,units = 'secs')) True_Labels_SingleR[i] <- list(Labels[Test_Idx[[i]]]) Pred_Labels_SingleR[i] <- list(as.vector(singler$labels)) } True_Labels_SingleR <- as.vector(unlist(True_Labels_SingleR)) Pred_Labels_SingleR <- as.vector(unlist(Pred_Labels_SingleR)) Total_Time_SingleR <- as.vector(unlist(Total_Time_SingleR)) setwd(OutputDir) if(!is.null(GeneOrderPath) & !is.null (NumGenes)){ write.csv(True_Labels_SingleR,paste('SingleR_',NumGenes,'_True_Labels.csv', sep = ''),row.names = FALSE) write.csv(Pred_Labels_SingleR,paste('SingleR_',NumGenes,'_Pred_Labels.csv', sep = ''),row.names = FALSE) write.csv(Total_Time_SingleR,paste('SingleR_',NumGenes,'_Total_Time.csv', sep = ''),row.names = FALSE) } else{ write.csv(True_Labels_SingleR,'SingleR_True_Labels.csv',row.names = FALSE) write.csv(Pred_Labels_SingleR,'SingleR_Pred_Labels.csv',row.names = FALSE) write.csv(Total_Time_SingleR,'SingleR_Total_Time.csv',row.names = FALSE) } } ================================================ FILE: Scripts/run_kNN50.py ================================================ import os import numpy as np import pandas as pd import time as tm from sklearn.neighbors import KNeighborsClassifier import rpy2.robjects as robjects def run_kNN50(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0): ''' run baseline classifiers: kNN Wrapper script to run kNN (with k = 50) classifier on a benchmark dataset with 5-fold cross validation, outputs lists of true and predicted cell labels as csv files, as well as computation time. Parameters ---------- DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes as row names and gene names as column names. LabelsPath : Cell population annotations file path (.csv). CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function. OutputDir : Output directory defining the path of the exported file. GeneOrderPath : Gene order file path (.csv) obtained from feature selection, defining the genes order for each cross validation fold, default is NULL. NumGenes : Number of genes used in case of feature selection (integer), default is 0. ''' # read the Rdata file robjects.r['load'](CV_RDataPath) nfolds = np.array(robjects.r['n_folds'], dtype = 'int') tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool') col = np.array(robjects.r['col_Index'], dtype = 'int') col = col - 1 test_ind = np.array(robjects.r['Test_Idx']) train_ind = np.array(robjects.r['Train_Idx']) # read the data data = pd.read_csv(DataPath,index_col=0,sep=',') labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col) labels = labels.iloc[tokeep] data = data.iloc[tokeep] # read the feature file if (NumGenes > 0): features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',') # folder with results os.chdir(OutputDir) # normalize data data = np.log1p(data) Classifier = KNeighborsClassifier(n_neighbors=50) tr_time=[] ts_time=[] truelab = [] pred = [] for i in range(np.squeeze(nfolds)): test_ind_i = np.array(test_ind[i], dtype = 'int') - 1 train_ind_i = np.array(train_ind[i], dtype = 'int') - 1 train=data.iloc[train_ind_i] test=data.iloc[test_ind_i] y_train=labels.iloc[train_ind_i] y_test=labels.iloc[test_ind_i] if (NumGenes > 0): feat_to_use = features.iloc[0:NumGenes,i] train = train.iloc[:,feat_to_use] test = test.iloc[:,feat_to_use] start=tm.time() Classifier.fit(train, y_train) tr_time.append(tm.time()-start) start=tm.time() predicted = Classifier.predict(test) ts_time.append(tm.time()-start) truelab.extend(y_test.values) pred.extend(predicted) truelab = pd.DataFrame(truelab) pred = pd.DataFrame(pred) tr_time = pd.DataFrame(tr_time) ts_time = pd.DataFrame(ts_time) if (NumGenes == 0): truelab.to_csv("kNN50_True_Labels.csv", index = False) pred.to_csv("kNN50_Pred_Labels.csv", index = False) tr_time.to_csv("kNN50_Training_Time.csv", index = False) ts_time.to_csv("kNN50_Testing_Time.csv", index = False) else: truelab.to_csv("kNN50_" + str(NumGenes) + "_True_Labels.csv", index = False) pred.to_csv("kNN50_" + str(NumGenes) + "_Pred_Labels.csv", index = False) tr_time.to_csv("kNN50_" + str(NumGenes) + "_Training_Time.csv", index = False) ts_time.to_csv("kNN50_" + str(NumGenes) + "_Testing_Time.csv", index = False) ================================================ FILE: Scripts/run_kNN9.py ================================================ import os import numpy as np import pandas as pd import time as tm from sklearn.neighbors import KNeighborsClassifier import rpy2.robjects as robjects def run_kNN9(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0): ''' run baseline classifiers: kNN Wrapper script to run kNN (with k = 9) classifier on a benchmark dataset with 5-fold cross validation, outputs lists of true and predicted cell labels as csv files, as well as computation time. Parameters ---------- DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes as row names and gene names as column names. LabelsPath : Cell population annotations file path (.csv). CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function. OutputDir : Output directory defining the path of the exported file. GeneOrderPath : Gene order file path (.csv) obtained from feature selection, defining the genes order for each cross validation fold, default is NULL. NumGenes : Number of genes used in case of feature selection (integer), default is 0. ''' # read the Rdata file robjects.r['load'](CV_RDataPath) nfolds = np.array(robjects.r['n_folds'], dtype = 'int') tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool') col = np.array(robjects.r['col_Index'], dtype = 'int') col = col - 1 test_ind = np.array(robjects.r['Test_Idx']) train_ind = np.array(robjects.r['Train_Idx']) # read the data data = pd.read_csv(DataPath,index_col=0,sep=',') labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col) labels = labels.iloc[tokeep] data = data.iloc[tokeep] # read the feature file if (NumGenes > 0): features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',') # folder with results os.chdir(OutputDir) # normalize data data = np.log1p(data) Classifier = KNeighborsClassifier(n_neighbors=9) tr_time=[] ts_time=[] truelab = [] pred = [] for i in range(np.squeeze(nfolds)): test_ind_i = np.array(test_ind[i], dtype = 'int') - 1 train_ind_i = np.array(train_ind[i], dtype = 'int') - 1 train=data.iloc[train_ind_i] test=data.iloc[test_ind_i] y_train=labels.iloc[train_ind_i] y_test=labels.iloc[test_ind_i] if (NumGenes > 0): feat_to_use = features.iloc[0:NumGenes,i] train = train.iloc[:,feat_to_use] test = test.iloc[:,feat_to_use] start=tm.time() Classifier.fit(train, y_train) tr_time.append(tm.time()-start) start=tm.time() predicted = Classifier.predict(test) ts_time.append(tm.time()-start) truelab.extend(y_test.values) pred.extend(predicted) truelab = pd.DataFrame(truelab) pred = pd.DataFrame(pred) tr_time = pd.DataFrame(tr_time) ts_time = pd.DataFrame(ts_time) if (NumGenes == 0): truelab.to_csv("kNN9_True_Labels.csv", index = False) pred.to_csv("kNN9_Pred_Labels.csv", index = False) tr_time.to_csv("kNN9_Training_Time.csv", index = False) ts_time.to_csv("kNN9_Testing_Time.csv", index = False) else: truelab.to_csv("kNN9_" + str(NumGenes) + "_True_Labels.csv", index = False) pred.to_csv("kNN9_" + str(NumGenes) + "_Pred_Labels.csv", index = False) tr_time.to_csv("kNN9_" + str(NumGenes) + "_Training_Time.csv", index = False) ts_time.to_csv("kNN9_" + str(NumGenes) + "_Testing_Time.csv", index = False) ================================================ FILE: Scripts/run_moana.py ================================================ import os import pandas as pd import numpy as np from moana.core import ExpMatrix from moana.classify import CellTypeClassifier import time as tm import rpy2.robjects as robjects def run_moana(DataPath, LabelsPath, ClassifierPath, OutputDir, GeneOrderPath = "", NumGenes = 0): ''' run moana Wrapper script to run moana on a benchmark dataset with a pretrained classifier, outputs lists of true and predicted cell labels as csv files, as well as computation time. Parameters ---------- DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes as row names and gene names as column names. LabelsPath : Cell population annotations file path (.csv). CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function. ClassifierPath : Data file path to the pretrained classifier. OutputDir : Output directory defining the path of the exported file. GeneOrderPath : Gene order file path (.csv) obtained from feature selection, defining the genes order for each cross validation fold, default is NULL. NumGenes : Number of genes used in case of feature selection (integer), default is 0. ''' # # read the Rdata file # robjects.r['load'](CV_RDataPath) # # tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool') # col = np.array(robjects.r['col_Index'], dtype = 'int') # col = col - 1 matrix = ExpMatrix.read_tsv(DataPath, sep = ',') # matrix = matrix.iloc[tokeep] truelab = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',') # truelab = truelab.iloc[tokeep] ct_old = ['CD19+ B','CD14+ Monocyte','CD4+/CD45RA+/CD25- Naive T','CD4+/CD45RO+ Memory','CD8+/CD45RA+ Naive Cytotoxic','Dendritic', 'CD56+ NK'] ct_new = ['B cells','CD14+ monocytes','Naive CD4+ T cells','Memory CD4+ T cells','Naive CD8+ T cells','Dendritic cells','NK cells'] tokeep2 = np.isin(truelab,ct_old) truelab = truelab[tokeep2] print(len(truelab)) matrix = matrix.iloc[np.squeeze(tokeep2)] for i in range(len(ct_old)): truelab.iloc[truelab == ct_old[i]] = ct_new[i] # read the feature file if (NumGenes > 0): features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',') feat_to_use = features.iloc[0:NumGenes,0] matrix = matrix.iloc[:,feat_to_use] data = ExpMatrix(X = np.transpose(matrix.X), genes = matrix.cells, cells = matrix.genes) data.genes.name = 'Genes' data.cells.name = 'Cells' data.index.name = 'Genes' data.columns.name = 'Cells' clf = CellTypeClassifier.read_pickle(ClassifierPath) start = tm.time() predictions = clf.predict(data) runtime = tm.time() - start np.asarray(predictions) pred = pd.DataFrame(predictions) os.chdir(OutputDir) if (NumGenes == 0): truelab.to_csv("moana_True_Labels.csv", index = False) pred.to_csv("moana_Pred_Labels.csv", index = False) with open("moana_Total_Time.csv", 'w') as f: f.write("%f\n" % runtime) else: truelab.to_csv("moana_" + str(NumGenes) + "_True_Labels.csv", index = False) pred.to_csv("moana_" + str(NumGenes) + "_Pred_Labels.csv", index = False) with open("moana_" + str(NumGenes) + "_Total_Time.csv", 'w') as f: f.write("%f\n" % runtime) ================================================ FILE: Scripts/run_scID.R ================================================ run_scID<-function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){ " run scID Wrapper script to run scID on a benchmark dataset with 5-fold cross validation, outputs lists of true and predicted cell labels as csv files, as well as computation time. Parameters ---------- DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes as row names and gene names as column names. LabelsPath : Cell population annotations file path (.csv). CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function. OutputDir : Output directory defining the path of the exported file. GeneOrderPath : Gene order file path (.csv) obtained from feature selection, defining the genes order for each cross validation fold, default is NULL. NumGenes : Number of genes used in case of feature selection (integer), default is NULL. " Data <- read.csv(DataPath,row.names = 1) Labels <- as.matrix(read.csv(LabelsPath)) load(CV_RDataPath) Labels <- as.vector(Labels[,col_Index]) Data <- Data[Cells_to_Keep,] Labels <- Labels[Cells_to_Keep] if(!is.null(GeneOrderPath) & !is.null (NumGenes)){ GenesOrder = read.csv(GeneOrderPath) } ############################################################################# # scID # ############################################################################# library(scID) library(Seurat) True_Labels_scID <- list() Pred_Labels_scID <- list() Total_Time_scID <- list() Data = t(as.matrix(Data)) for (i in c(1:n_folds)){ if(!is.null(GeneOrderPath) & !is.null (NumGenes)){ Train_Labels <- list(Labels[Train_Idx[[i]]]) names(Train_Labels[[1]]) <- colnames(Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Train_Idx[[i]]]) start_time <- Sys.time() scID_output <- scid_multiclass(Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Test_Idx[[i]]], Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Train_Idx[[i]]], Train_Labels[[1]]) end_time <- Sys.time() } else{ Train_Labels <- list(Labels[Train_Idx[[i]]]) names(Train_Labels[[1]]) <- colnames(Data[,Train_Idx[[i]]]) start_time <- Sys.time() scID_output <- scid_multiclass(Data[,Test_Idx[[i]]], Data[,Train_Idx[[i]]], Train_Labels[[1]]) end_time <- Sys.time() } Total_Time_scID[i] <- as.numeric(difftime(end_time,start_time,units = 'secs')) True_Labels_scID[i] <- list(Labels[Test_Idx[[i]]]) Pred_Labels_scID[i] <- list(as.vector(scID_output$labels)) } True_Labels_scID <- as.vector(unlist(True_Labels_scID)) Pred_Labels_scID <- as.vector(unlist(Pred_Labels_scID)) Total_Time_scID <- as.vector(unlist(Total_Time_scID)) setwd(OutputDir) if(!is.null(GeneOrderPath) & !is.null (NumGenes)){ write.csv(True_Labels_scID,paste('scID_',NumGenes,'_True_Labels.csv', sep = ''),row.names = FALSE) write.csv(Pred_Labels_scID,paste('scID_',NumGenes,'_Pred_Labels.csv', sep = ''),row.names = FALSE) write.csv(Total_Time_scID,paste('scID_',NumGenes,'_Total_Time.csv', sep = ''),row.names = FALSE) } else{ write.csv(True_Labels_scID,'scID_True_Labels.csv',row.names = FALSE) write.csv(Pred_Labels_scID,'scID_Pred_Labels.csv',row.names = FALSE) write.csv(Total_Time_scID,'scID_Total_Time.csv',row.names = FALSE) } } ================================================ FILE: Scripts/run_scPred.R ================================================ run_scPred<-function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){ " run scPred Wrapper script to run scPred on a benchmark dataset with 5-fold cross validation, outputs lists of true and predicted cell labels as csv files, as well as computation time. Parameters ---------- DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes as row names and gene names as column names. LabelsPath : Cell population annotations file path (.csv). CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function. OutputDir : Output directory defining the path of the exported file. GeneOrderPath : Gene order file path (.csv) obtained from feature selection, defining the genes order for each cross validation fold, default is NULL. NumGenes : Number of genes used in case of feature selection (integer), default is NULL. " Data <- read.csv(DataPath,row.names = 1) Labels <- as.matrix(read.csv(LabelsPath)) load(CV_RDataPath) Labels <- as.vector(Labels[,col_Index]) Data <- Data[Cells_to_Keep,] Labels <- Labels[Cells_to_Keep] if(!is.null(GeneOrderPath) & !is.null (NumGenes)){ GenesOrder = read.csv(GeneOrderPath) } ############################################################################# # scPred # ############################################################################# library(scPred) library(tidyverse) library(SingleCellExperiment) True_Labels_scPred <- list() Pred_Labels_scPred <- list() Training_Time_scPred <- list() Testing_Time_scPred <- list() Data = t(as.matrix(Data)) for (i in c(1:n_folds)){ if(!is.null(GeneOrderPath) & !is.null (NumGenes)){ sce <- SingleCellExperiment(list(normcounts = Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Train_Idx[[i]]]), colData = data.frame(cell_type1 = Labels[Train_Idx[[i]]])) sce_counts <- normcounts(sce) sce_cpm <- apply(sce_counts, 2, function(x) (x/sum(x))*1000000) sce_metadata <- as.data.frame(colData(sce)) sce_test <- SingleCellExperiment(list(normcounts = Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Test_Idx[[i]]]), colData = data.frame(cell_type1 = Labels[Test_Idx[[i]]])) sce_counts_test <- normcounts(sce_test) sce_cpm_test <- apply(sce_counts_test, 2, function(x) (x/sum(x))*1000000) sce_metadata_test <- as.data.frame(colData(sce_test)) } else{ sce <- SingleCellExperiment(list(normcounts = Data[,Train_Idx[[i]]]), colData = data.frame(cell_type1 = Labels[Train_Idx[[i]]])) sce_counts <- normcounts(sce) sce_cpm <- apply(sce_counts, 2, function(x) (x/sum(x))*1000000) sce_metadata <- as.data.frame(colData(sce)) sce_test <- SingleCellExperiment(list(normcounts = Data[,Test_Idx[[i]]]), colData = data.frame(cell_type1 = Labels[Test_Idx[[i]]])) sce_counts_test <- normcounts(sce_test) sce_cpm_test <- apply(sce_counts_test, 2, function(x) (x/sum(x))*1000000) sce_metadata_test <- as.data.frame(colData(sce_test)) } # scPred Training start_time <- Sys.time() set.seed(1234) scp <- eigenDecompose(sce_cpm) scPred::metadata(scp) <- sce_metadata scp <- getFeatureSpace(scp, pVar = 'cell_type1') # plotEigen(scp, group = 'cell_type1') scp <- trainModel(scp) # plotTrainProbs(scp) end_time <- Sys.time() Training_Time_scPred[i] <- as.numeric(difftime(end_time,start_time,units = 'secs')) # scPred Prediction start_time <- Sys.time() scp <- scPredict(scp,newData = sce_cpm_test) end_time <- Sys.time() Testing_Time_scPred[i] <- as.numeric(difftime(end_time,start_time,units = 'secs')) True_Labels_scPred[i] <- list(Labels[Test_Idx[[i]]]) Pred_Labels_scPred[i] <- list(getPredictions(scp)$predClass) } True_Labels_scPred <- as.vector(unlist(True_Labels_scPred)) Pred_Labels_scPred <- as.vector(unlist(Pred_Labels_scPred)) Training_Time_scPred <- as.vector(unlist(Training_Time_scPred)) Testing_Time_scPred <- as.vector(unlist(Testing_Time_scPred)) setwd(OutputDir) if(!is.null(GeneOrderPath) & !is.null (NumGenes)){ write.csv(True_Labels_scPred,paste('scPred_',NumGenes,'_True_Labels.csv', sep = ''),row.names = FALSE) write.csv(Pred_Labels_scPred,paste('scPred_',NumGenes,'_Pred_Labels.csv', sep = ''),row.names = FALSE) write.csv(Training_Time_scPred,paste('scPred_',NumGenes,'_Training_Time.csv', sep = ''),row.names = FALSE) write.csv(Testing_Time_scPred,paste('scPred_',NumGenes,'_Testing_Time.csv', sep = ''),row.names = FALSE) } else{ write.csv(True_Labels_scPred,'scPred_True_Labels.csv',row.names = FALSE) write.csv(Pred_Labels_scPred,'scPred_Pred_Labels.csv',row.names = FALSE) write.csv(Training_Time_scPred,'scPred_Training_Time.csv',row.names = FALSE) write.csv(Testing_Time_scPred,'scPred_Testing_Time.csv',row.names = FALSE) } } ================================================ FILE: Scripts/run_scVI.py ================================================ from scvi.dataset import CsvDataset import os import numpy as np import pandas as pd from scvi.models import SCANVI from scvi.inference import SemiSupervisedTrainer import time as tm import rpy2.robjects as robjects def run_scVI(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0): ''' run scVI Wrapper script to run scVI on a benchmark dataset with 5-fold cross validation, outputs lists of true and predicted cell labels as csv files, as well as computation time. Parameters ---------- DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes as row names and gene names as column names. LabelsPath : Cell population annotations file path (.csv). CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function. OutputDir : Output directory defining the path of the exported file. GeneOrderPath : Gene order file path (.csv) obtained from feature selection, defining the genes order for each cross validation fold, default is NULL. NumGenes : Number of genes used in case of feature selection (integer), default is 0. ''' # read the Rdata file robjects.r['load'](CV_RDataPath) nfolds = np.array(robjects.r['n_folds'], dtype = 'int') tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool') col = np.array(robjects.r['col_Index'], dtype = 'int') col = col - 1 test_ind = np.array(robjects.r['Test_Idx']) train_ind = np.array(robjects.r['Train_Idx']) # read the data data = pd.read_csv(DataPath,index_col=0,sep=',') labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col) labels = labels.iloc[tokeep] data = data.iloc[tokeep] # read the feature file if (NumGenes > 0): features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',') os.chdir(OutputDir) if (NumGenes == 0): #save labels as csv file with header and index column labels.to_csv('Labels_scvi.csv') data.to_csv('Data_scvi.csv') train = CsvDataset('Data_scvi.csv', save_path = "", sep = ",", labels_file = "Labels_scvi.csv", gene_by_cell = False) ## this semisupervised trainer automatically uses a part of the input data for training and a part for testing scanvi = SCANVI(train.nb_genes, train.n_batches, train.n_labels) trainer_scanvi = SemiSupervisedTrainer(scanvi, train, frequency=5) n_epochs = 200 truelab = [] pred = [] tr_time = [] ts_time = [] for i in range(np.squeeze(nfolds)): test_ind_i = np.array(test_ind[i], dtype = 'int') - 1 train_ind_i = np.array(train_ind[i], dtype = 'int') - 1 if (NumGenes > 0): feat_to_use = features.iloc[0:NumGenes,i] data2 = data.iloc[:,feat_to_use] labels.to_csv('Labels_scvi.csv') data2.to_csv('Data_scvi.csv') train = CsvDataset('Data_scvi.csv', save_path = "", sep = ",", labels_file = "Labels_scvi.csv", gene_by_cell = False, new_n_genes = False) ## this semisupervised trainer automatically uses a part of the input data for training and a part for testing scanvi = SCANVI(train.nb_genes, train.n_batches, train.n_labels) trainer_scanvi = SemiSupervisedTrainer(scanvi, train, frequency=5) trainer_scanvi.labelled_set = trainer_scanvi.create_posterior(indices=(train_ind_i).ravel(), shuffle = False) trainer_scanvi.labelled_set.to_monitor = ['ll','accuracy'] trainer_scanvi.unlabelled_set = trainer_scanvi.create_posterior(indices=(test_ind_i).ravel(), shuffle = False) trainer_scanvi.unlabelled_set.to_monitor = ['ll','accuracy'] start = tm.time() trainer_scanvi.train(n_epochs) tr_time.append(tm.time()-start) ## labels of test set are in y_pred ## labels are returned in numbers, should be mapped back to the real labels ## indices are permutated start = tm.time() y_true, y_pred = trainer_scanvi.unlabelled_set.compute_predictions() ts_time.append(tm.time()-start) truelab.extend(y_true) pred.extend(y_pred) #write results truelab = pd.DataFrame(truelab) pred = pd.DataFrame(pred) tr_time = pd.DataFrame(tr_time) ts_time = pd.DataFrame(ts_time) if (NumGenes == 0): truelab.to_csv("scVI_True_Labels.csv", index = False) pred.to_csv("scVI_Pred_Labels.csv", index = False) tr_time.to_csv("scVI_Training_Time.csv", index = False) ts_time.to_csv("scVI_Testing_Time.csv", index = False) else: truelab.to_csv("scVI_" + str(NumGenes) + "_True_Labels.csv", index = False) pred.to_csv("scVI_" + str(NumGenes) + "_Pred_Labels.csv", index = False) tr_time.to_csv("scVI_" + str(NumGenes) + "_Training_Time.csv", index = False) ts_time.to_csv("scVI_" + str(NumGenes) + "_Testing_Time.csv", index = False) ================================================ FILE: Scripts/run_scmap.R ================================================ run_scmap <- function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){ " run scmap Wrapper script to run scmap on a benchmark dataset with 5-fold cross validation, outputs lists of true and predicted cell labels as csv files, as well as computation time. Parameters ---------- DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes as row names and gene names as column names. LabelsPath : Cell population annotations file path (.csv). CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function. OutputDir : Output directory defining the path of the exported file. GeneOrderPath : Gene order file path (.csv) obtained from feature selection, defining the genes order for each cross validation fold, default is NULL. NumGenes : Number of genes used in case of feature selection (integer), default is NULL. " Data <- read.csv(DataPath,row.names = 1) Labels <- as.matrix(read.csv(LabelsPath)) load(CV_RDataPath) Labels <- as.vector(Labels[,col_Index]) Data <- Data[Cells_to_Keep,] Labels <- Labels[Cells_to_Keep] if(!is.null(GeneOrderPath) & !is.null (NumGenes)){ GenesOrder = read.csv(GeneOrderPath) } ############################################################################# # scmap # ############################################################################# library(scmap) library(SingleCellExperiment) True_Labels_scmapcluster <- list() Pred_Labels_scmapcluster <- list() True_Labels_scmapcell <- list() Pred_Labels_scmapcell <- list() Training_Time_scmapcluster <- list() Testing_Time_scmapcluster <- list() Training_Time_scmapcell <- list() Testing_Time_scmapcell <- list() Data = t(as.matrix(Data)) for (i in c(1:n_folds)){ if(!is.null(GeneOrderPath) & !is.null (NumGenes)){ sce <- SingleCellExperiment(list(normcounts = Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Train_Idx[[i]]]), colData = data.frame(cell_type1 = Labels[Train_Idx[[i]]])) logcounts(sce) <- log2(normcounts(sce) + 1) # use gene names as feature symbols rowData(sce)$feature_symbol <- rownames(sce) sce <- selectFeatures(sce, n_features = NumGenes, suppress_plot = TRUE) sce_test <- SingleCellExperiment(list(normcounts = Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Test_Idx[[i]]]), colData = data.frame(cell_type1 = Labels[Test_Idx[[i]]])) logcounts(sce_test) <- log2(normcounts(sce_test) + 1) rowData(sce_test)$feature_symbol <- rownames(sce_test) sce_test@rowRanges@elementMetadata@listData = sce@rowRanges@elementMetadata@listData } else{ sce <- SingleCellExperiment(list(normcounts = Data[,Train_Idx[[i]]]), colData = data.frame(cell_type1 = Labels[Train_Idx[[i]]])) logcounts(sce) <- log2(normcounts(sce) + 1) # use gene names as feature symbols rowData(sce)$feature_symbol <- rownames(sce) sce <- selectFeatures(sce, suppress_plot = TRUE) sce_test <- SingleCellExperiment(list(normcounts = Data[,Test_Idx[[i]]]), colData = data.frame(cell_type1 = Labels[Test_Idx[[i]]])) logcounts(sce_test) <- log2(normcounts(sce_test) + 1) rowData(sce_test)$feature_symbol <- rownames(sce_test) sce_test@rowRanges@elementMetadata@listData = sce@rowRanges@elementMetadata@listData } # scmap-cluster start_time <- Sys.time() sce <- indexCluster(sce) end_time <- Sys.time() Training_Time_scmapcluster[i] <- as.numeric(difftime(end_time,start_time,units = 'secs')) start_time <- Sys.time() scmapCluster_results <- scmapCluster(projection = sce_test,index_list = list(metadata(sce)$scmap_cluster_index)) end_time <- Sys.time() Testing_Time_scmapcluster[i] <- as.numeric(difftime(end_time,start_time,units = 'secs')) True_Labels_scmapcluster[i] <- list(Labels[Test_Idx[[i]]]) Pred_Labels_scmapcluster[i] <- list(scmapCluster_results$combined_labs) # scmap-cell start_time <- Sys.time() set.seed(1) sce <- indexCell(sce) end_time <- Sys.time() Training_Time_scmapcell[i] <- as.numeric(difftime(end_time,start_time,units = 'secs')) start_time <- Sys.time() scmapCell_results <- scmapCell(sce_test,list(metadata(sce)$scmap_cell_index)) scmapCell_clusters <- scmapCell2Cluster(scmapCell_results,list(as.character(colData(sce)$cell_type1))) end_time <- Sys.time() Testing_Time_scmapcell[i] <- as.numeric(difftime(end_time,start_time,units = 'secs')) True_Labels_scmapcell[i] <- list(Labels[Test_Idx[[i]]]) Pred_Labels_scmapcell[i] <- list(scmapCell_clusters$combined_labs) } True_Labels_scmapcluster <- as.vector(unlist(True_Labels_scmapcluster)) Pred_Labels_scmapcluster <- as.vector(unlist(Pred_Labels_scmapcluster)) True_Labels_scmapcell <- as.vector(unlist(True_Labels_scmapcell)) Pred_Labels_scmapcell <- as.vector(unlist(Pred_Labels_scmapcell)) Training_Time_scmapcluster <- as.vector(unlist(Training_Time_scmapcluster)) Testing_Time_scmapcluster <- as.vector(unlist(Testing_Time_scmapcluster)) Training_Time_scmapcell <- as.vector(unlist(Training_Time_scmapcell)) Testing_Time_scmapcell <- as.vector(unlist(Testing_Time_scmapcell)) setwd(OutputDir) if (!is.null(GeneOrderPath) & !is.null (NumGenes)){ write.csv(True_Labels_scmapcluster,paste('scmapcluster_',NumGenes,'_True_Labels.csv', sep = ''),row.names = FALSE) write.csv(Pred_Labels_scmapcluster,paste('scmapcluster_',NumGenes,'_Pred_Labels.csv', sep = ''),row.names = FALSE) write.csv(True_Labels_scmapcell,paste('scmapcell_',NumGenes,'_True_Labels.csv', sep = ''),row.names = FALSE) write.csv(Pred_Labels_scmapcell,paste('scmapcell_',NumGenes,'_Pred_Labels.csv', sep = ''),row.names = FALSE) write.csv(Training_Time_scmapcluster,paste('scmapcluster_',NumGenes,'_Training_Time.csv', sep = ''),row.names = FALSE) write.csv(Testing_Time_scmapcluster,paste('scmapcluster_',NumGenes,'_Testing_Time.csv', sep = ''),row.names = FALSE) write.csv(Training_Time_scmapcell,paste('scmapcell_',NumGenes,'_Training_Time.csv', sep = ''),row.names = FALSE) write.csv(Testing_Time_scmapcell,paste('scmapcell_',NumGenes,'_Testing_Time.csv', sep = ''),row.names = FALSE) } else{ write.csv(True_Labels_scmapcluster,'scmapcluster_True_Labels.csv',row.names = FALSE) write.csv(Pred_Labels_scmapcluster,'scmapcluster_Pred_Labels.csv',row.names = FALSE) write.csv(True_Labels_scmapcell,'scmapcell_True_Labels.csv',row.names = FALSE) write.csv(Pred_Labels_scmapcell,'scmapcell_Pred_Labels.csv',row.names = FALSE) write.csv(Training_Time_scmapcluster,'scmapcluster_Training_Time.csv',row.names = FALSE) write.csv(Testing_Time_scmapcluster,'scmapcluster_Testing_Time.csv',row.names = FALSE) write.csv(Training_Time_scmapcell,'scmapcell_Training_Time.csv',row.names = FALSE) write.csv(Testing_Time_scmapcell,'scmapcell_Testing_Time.csv',row.names = FALSE) } } ================================================ FILE: Scripts/run_singleCellNet.R ================================================ run_singleCellNet<-function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){ " run singleCellNet Wrapper script to run singleCellNet on a benchmark dataset with 5-fold cross validation, outputs lists of true and predicted cell labels as csv files, as well as computation time. Parameters ---------- DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes as row names and gene names as column names. LabelsPath : Cell population annotations file path (.csv). CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function. OutputDir : Output directory defining the path of the exported file. GeneOrderPath : Gene order file path (.csv) obtained from feature selection, defining the genes order for each cross validation fold, default is NULL. NumGenes : Number of genes used in case of feature selection (integer), default is NULL. " Data <- read.csv(DataPath,row.names = 1) colnames(Data) <- gsub('_','.',colnames(Data), fixed = TRUE) Labels <- as.matrix(read.csv(LabelsPath)) load(CV_RDataPath) Labels <- as.vector(Labels[,col_Index]) Data <- Data[Cells_to_Keep,] Labels <- Labels[Cells_to_Keep] if(!is.null(GeneOrderPath) & !is.null (NumGenes)){ GenesOrder = read.csv(GeneOrderPath) } ############################################################################# # singleCellNet # ############################################################################# library(singleCellNet) library(dplyr) True_Labels_singleCellNet <- list() Pred_Labels_singleCellNet <- list() Training_Time_singleCellNet <- list() Testing_Time_singleCellNet <- list() Data = t(as.matrix(Data)) # deals also with sparse matrix for(i in c(1:n_folds)){ if(!is.null(GeneOrderPath) & !is.null (NumGenes)){ DataTrain <- Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Train_Idx[[i]]] DataTest <- Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Test_Idx[[i]]] } else{ DataTrain <- Data[,Train_Idx[[i]]] DataTest <- Data[,Test_Idx[[i]]] } start_time <- Sys.time() cgenes2<-findClassyGenes(DataTrain, data.frame(Annotation = Labels[Train_Idx[[i]]]), "Annotation") cgenesA<-cgenes2[['cgenes']] grps<-cgenes2[['grps']] DataTrain<-as.matrix(DataTrain[cgenesA,]) xpairs<-ptGetTop(DataTrain, grps, ncores = 1) pdTrain<-query_transform(DataTrain[cgenesA, ], xpairs) rf<-sc_makeClassifier(pdTrain[xpairs,], genes=xpairs, groups=grps) end_time <- Sys.time() Training_Time_singleCellNet[i] <- as.numeric(difftime(end_time,start_time,units = 'secs')) start_time <- Sys.time() DataTest<-query_transform(DataTest[cgenesA,], xpairs) classRes <-rf_classPredict(rf, DataTest) end_time <- Sys.time() Testing_Time_singleCellNet[i] <- as.numeric(difftime(end_time,start_time,units = 'secs')) True_Labels_singleCellNet[i] <- list(Labels[Test_Idx[[i]]]) Pred_Labels_singleCellNet[i] <- list((rownames(classRes)[apply(classRes,2,which.max)])[1:length(Test_Idx[[i]])]) } True_Labels_singleCellNet <- as.vector(unlist(True_Labels_singleCellNet)) Pred_Labels_singleCellNet <- as.vector(unlist(Pred_Labels_singleCellNet)) Training_Time_singleCellNet <- as.vector(unlist(Training_Time_singleCellNet)) Testing_Time_singleCellNet <- as.vector(unlist(Testing_Time_singleCellNet)) setwd(OutputDir) if(!is.null(GeneOrderPath) & !is.null (NumGenes)){ write.csv(True_Labels_singleCellNet,paste('singleCellNet_',NumGenes,'_True_Labels.csv', sep = ''),row.names = FALSE) write.csv(Pred_Labels_singleCellNet,paste('singleCellNet_',NumGenes,'_Pred_Labels.csv', sep = ''),row.names = FALSE) write.csv(Training_Time_singleCellNet,paste('singleCellNet_',NumGenes,'_Training_Time.csv', sep = ''),row.names = FALSE) write.csv(Testing_Time_singleCellNet,paste('singleCellNet_',NumGenes,'_Testing_Time.csv', sep = ''),row.names = FALSE) } else{ write.csv(True_Labels_singleCellNet,'singleCellNet_True_Labels.csv',row.names = FALSE) write.csv(Pred_Labels_singleCellNet,'singleCellNet_Pred_Labels.csv',row.names = FALSE) write.csv(Training_Time_singleCellNet,'singleCellNet_Training_Time.csv',row.names = FALSE) write.csv(Testing_Time_singleCellNet,'singleCellNet_Testing_Time.csv',row.names = FALSE) } } ================================================ FILE: Snakemake/Cross_Validation.R ================================================ args <- commandArgs(TRUE) Cross_Validation <- function(LabelsPath, col_Index = 1, OutputDir){ " Cross_Validation Function returns train and test indices for 5 folds stratified across unique cell populations, also filter out cell populations with less than 10 cells. It return a 'CV_folds.RData' file which then used as input to classifiers wrappers. Parameters ---------- LabelsPath : Cell population annotations file path (.csv). col_Index : column index (integer) defining which level of annotation to use, in case of multiple cell type annotations (default is 1) OutputDir : Output directory defining the path of the exported file. " Labels <- as.matrix(read.csv(LabelsPath)) Labels <- as.vector(Labels[,col_Index]) Removed_classes <- !(table(Labels) > 10) Cells_to_Keep <- !(is.element(Labels,names(Removed_classes)[Removed_classes])) Labels <- Labels[Cells_to_Keep] # Getting training and testing Folds library(rBayesianOptimization) n_folds = 5 Folds <- KFold(Labels,nfolds = n_folds, stratified = TRUE) Test_Folds <- c(n_folds:1) Train_Idx <- list() Test_Idx <- list() for (i in c(1:length(Folds))){ Temp_Folds <- Folds Temp_Folds[Test_Folds[i]] <- NULL Train_Idx[i] <- list(unlist(Temp_Folds)) Test_Idx[i] <- Folds[Test_Folds[i]] } remove(Temp_Folds,i,Folds) save(n_folds,Train_Idx,Test_Idx,col_Index,Cells_to_Keep,file = paste0(OutputDir, '/CV_folds.RData')) } Cross_Validation(args[1], as.numeric(args[2]), args[3]) ================================================ FILE: Snakemake/DEgenesMAST.R ================================================ DEgenesMAST <- function(Data, Labels, Normalize = FALSE, LogTransform = FALSE){ # This functions applies a differential expression test to the data using one vs all # The training data should be used a an input # The output is a matrix with marker genes where the columns are the cell populations and the rows are the top20 marker genes # This output can be rewritten to the format of the prior-knowledge-supervised classifiers and afterwards be used to classify the test set. # Data: genes X cells (rows = genes, columns = cells) # Labels: labels of the data # Normalize: the input for MAST should be cpm normalized data, # if the data is not normalized yet, this should be set to TRUE # LogTransform: the input for MAST should be logtransformed, # if the data is not logtransformed yet, this should be set to TRUE library(Seurat) if(Normalize) { Data <- apply(Data, 2, function(x) (x/sum(x))*1000000) } if(LogTransform) { Data <- log(Data+1, base = 2) } SeuObj <- CreateSeuratObject(raw.data = Data, project = "DEgenes") SeuObj <- SetIdent(SeuObj, ident.use = Labels) DEgenes <- FindAllMarkers(SeuObj, test.use = "MAST") Markers <- matrix(nrow = 20,ncol = length(unique(Labels))) colnames(Markers) <- unique(Labels) for (i in unique(Labels)){ i TempList <- DEgenes$gene[((DEgenes$cluster == i) & (DEgenes$avg_logFC > 0))] MarkerGenes <- DEgenes$p_val_adj[DEgenes$cluster == i] print(MarkerGenes[1:20]) if (length(TempList) >= 20){ Markers[,i] <- TempList[1:20] } else{ if(length(TempList) > 0){ Markers[c(1:length(TempList)),i] <- TempList } } } return(Markers) } ================================================ FILE: Snakemake/Dockerfiles/baseline/Dockerfile ================================================ FROM debian:9.9-slim # Install newest R version RUN apt-get update && \ apt-get install --no-install-recommends --yes wget gnupg apt-transport-https && \ wget -qO - http://keys.gnupg.net/pks/lookup?op=get\&search=0xAD5F960A256A04AF | apt-key add - && \ echo 'deb http://cloud.r-project.org/bin/linux/debian stretch-cran35/' >> /etc/apt/sources.list && \ apt-get update && \ apt-get install --no-install-recommends --yes r-base && \ apt-get purge --yes wget gnupg apt-transport-https && \ apt-get autoremove --yes && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* # Install python RUN apt-get update && \ apt-get install --no-install-recommends --yes python3 python3-pip && \ pip3 --no-cache-dir install setuptools && \ pip3 --no-cache-dir install pandas rpy2 scikit-learn statsmodels && \ rm -rf /var/lib/apt/lists/* COPY Scripts/run_kNN50.py \ Scripts/run_kNN9.py \ Scripts/run_LDA.py \ Scripts/run_LDA_rejection.py \ Scripts/run_NMC.py \ Scripts/run_RF.py \ Scripts/run_SVM.py \ Scripts/run_SVM_rejection.py \ rank_gene_dropouts.py \ /Scripts/ ================================================ FILE: Snakemake/Dockerfiles/cell_blast/Dockerfile ================================================ FROM python:3.7-slim-stretch # Install newest R version RUN apt-get update && \ apt-get install --no-install-recommends --yes wget gnupg apt-transport-https && \ wget -qO - http://keys.gnupg.net/pks/lookup?op=get\&search=0xAD5F960A256A04AF | apt-key add - && \ echo 'deb http://cloud.r-project.org/bin/linux/debian stretch-cran35/' >> /etc/apt/sources.list && \ apt-get update && \ apt-get install --no-install-recommends --yes r-base && \ apt-get purge --yes wget gnupg apt-transport-https && \ apt-get autoremove --yes && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* # Install python and pip deps RUN apt-get update && apt-get install --no-install-recommends --yes build-essential python3-dev libxml2 libxml2-dev zlib1g-dev && \ pip3 --no-cache-dir install --upgrade pip && \ pip3 --no-cache-dir install --upgrade setuptools && \ pip3 --no-cache-dir install pandas rpy2 scikit-learn statsmodels tensorflow Cell-BLAST && \ apt-get autoremove --yes && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* COPY Scripts/run_Cell_BLAST.py /Scripts/ ================================================ FILE: Snakemake/Dockerfiles/chetah/Dockerfile ================================================ FROM debian:9.9-slim # Install newest R version RUN apt-get update && \ apt-get install --no-install-recommends --yes wget gnupg apt-transport-https && \ wget -qO - http://keys.gnupg.net/pks/lookup?op=get\&search=0xAD5F960A256A04AF | apt-key add - && \ echo 'deb http://cloud.r-project.org/bin/linux/debian stretch-cran35/' >> /etc/apt/sources.list && \ apt-get update && \ apt-get install --no-install-recommends --yes r-base && \ apt-get purge --yes wget gnupg apt-transport-https && \ apt-get autoremove --yes && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* COPY Scripts/run_CHETAH.R \ Dockerfiles/chetah/install_packages.R \ /Scripts/ # Install R packages RUN apt-get update && \ apt-get install --no-install-recommends --yes make gcc g++ r-base-dev libcurl4-openssl-dev libssl-dev libxml2-dev && \ Rscript --vanilla /Scripts/install_packages.R && \ apt-get purge --yes make gcc g++ r-base-dev libcurl4-openssl-dev libssl-dev libxml2-dev && \ apt-get autoremove --yes && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* ================================================ FILE: Snakemake/Dockerfiles/chetah/install_packages.R ================================================ withCallingHandlers({ install.packages("devtools", repos="https://cloud.r-project.org/") install.packages("BiocManager", repos="https://cloud.r-project.org/") BiocManager::install(c("bioDist", "ggplot2", "gplots", "cowplot", "dendextend", "corrplot", "reshape2", "plotly")) devtools::install_github("jdekanter/CHETAH", ref="b777e6f671bff3c434842adb655869a52bc9e368") }, warning = function(w) stop(w)) ================================================ FILE: Snakemake/Dockerfiles/cross_validation/Dockerfile ================================================ FROM debian:9.9-slim # Install newest R version RUN apt-get update && \ apt-get install --no-install-recommends --yes wget gnupg apt-transport-https && \ wget -qO - http://keys.gnupg.net/pks/lookup?op=get\&search=0xAD5F960A256A04AF | apt-key add - && \ echo 'deb http://cloud.r-project.org/bin/linux/debian stretch-cran35/' >> /etc/apt/sources.list && \ apt-get update && \ apt-get install --no-install-recommends --yes r-base && \ apt-get purge --yes wget gnupg apt-transport-https && \ apt-get autoremove --yes && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* COPY Cross_Validation.R \ Dockerfiles/cross_validation/install_packages.R \ /Scripts/ # Install R packages RUN apt-get update && \ apt-get install --no-install-recommends --yes make gcc libc6-dev g++ libxml2-dev && \ Rscript --vanilla /Scripts/install_packages.R && \ apt-get purge --yes make gcc g++ libxml2-dev && \ apt-get autoremove --yes && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* ================================================ FILE: Snakemake/Dockerfiles/cross_validation/install_packages.R ================================================ withCallingHandlers({ install.packages("lhs", repos="https://cloud.r-project.org/") install.packages("rBayesianOptimization", repos="https://cloud.r-project.org/") }, warning = function(w) stop(w)) ================================================ FILE: Snakemake/Dockerfiles/garnett/Dockerfile ================================================ FROM debian:9.9-slim # Install newest R version RUN apt-get update && \ apt-get install --no-install-recommends --yes wget gnupg apt-transport-https && \ wget -qO - http://keys.gnupg.net/pks/lookup?op=get\&search=0xAD5F960A256A04AF | apt-key add - && \ echo 'deb http://cloud.r-project.org/bin/linux/debian stretch-cran35/' >> /etc/apt/sources.list && \ apt-get update && \ apt-get install --no-install-recommends --yes r-base && \ apt-get purge --yes wget gnupg apt-transport-https && \ apt-get autoremove --yes && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* COPY Scripts/run_Garnett_CV.R \ Scripts/run_Garnett_Pretrained.R \ Dockerfiles/garnett/install_packages.R \ /Scripts/ # Install R packages RUN apt-get update && \ apt-get install --no-install-recommends --yes make gcc g++ libxml2-dev zlib1g-dev gfortran liblapack-dev libcurl4-gnutls-dev libssl-dev && \ Rscript --vanilla /Scripts/install_packages.R && \ apt-get purge --yes make gcc g++ zlib1g-dev gfortran liblapack-dev libcurl4-gnutls-dev libssl-dev && \ apt-get autoremove --yes && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* ================================================ FILE: Snakemake/Dockerfiles/garnett/install_packages.R ================================================ withCallingHandlers({ install.packages("BiocManager", repos="https://cloud.r-project.org/") BiocManager::install(c("monocle", "DelayedArray", "DelayedMatrixStats", "org.Hs.eg.db", "org.Mm.eg.db")) install.packages("devtools", repos="https://cloud.r-project.org/") devtools::install_github("cole-trapnell-lab/garnett", ref="9804b532bbcc1714b3ed0b718cf430741f1dba6c") }, warning = function(w) stop(w)) ================================================ FILE: Snakemake/Dockerfiles/scid/Dockerfile ================================================ FROM r-base:3.6.0 COPY Scripts/run_scID.R \ Dockerfiles/scid/install_packages.R \ /Scripts/ # Install R packages RUN apt-get update && \ apt-get install --no-install-recommends --yes make gcc g++ r-base-dev libcurl4-openssl-dev libssl-dev libxml2-dev && \ Rscript --vanilla /Scripts/install_packages.R && \ apt-get purge --yes make gcc g++ r-base-dev libcurl4-openssl-dev libssl-dev libxml2-dev && \ apt-get autoremove --yes && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* ================================================ FILE: Snakemake/Dockerfiles/scid/install_packages.R ================================================ withCallingHandlers({ install.packages("BiocManager", repos="https://cloud.r-project.org/") BiocManager::install(ask = FALSE); BiocManager::install(c("scater", "MAST")) install.packages("devtools", repos="https://cloud.r-project.org/") devtools::install_github("satijalab/seurat") devtools::install_github("BatadaLab/scID") }, warning = function(w) stop(w)) ================================================ FILE: Snakemake/Dockerfiles/scmap/Dockerfile ================================================ FROM r-base:3.6.0 COPY Scripts/run_scmapcell.R \ Scripts/run_scmapcluster.R \ Dockerfiles/scmap/install_packages.R \ /Scripts/ # Install R packages RUN apt-get update && \ apt-get install --no-install-recommends --yes make gcc g++ r-base-dev libcurl4-openssl-dev libssl-dev libxml2-dev && \ Rscript --vanilla /Scripts/install_packages.R && \ apt-get purge --yes make gcc g++ r-base-dev libcurl4-openssl-dev libssl-dev libxml2-dev && \ apt-get autoremove --yes && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* ================================================ FILE: Snakemake/Dockerfiles/scmap/install_packages.R ================================================ withCallingHandlers({ install.packages("BiocManager", repos="https://cloud.r-project.org/") BiocManager::install(ask = FALSE) BiocManager::install("SingleCellExperiment") install.packages("devtools", repos="https://cloud.r-project.org/") devtools::install_github("hemberg-lab/scmap") }, warning = function(w) stop(w)) ================================================ FILE: Snakemake/Dockerfiles/scvi/Dockerfile ================================================ FROM python:3.7-slim-stretch # Install newest R version RUN apt-get update && \ apt-get install --no-install-recommends --yes wget gnupg apt-transport-https && \ wget -qO - http://keys.gnupg.net/pks/lookup?op=get\&search=0xAD5F960A256A04AF | apt-key add - && \ echo 'deb http://cloud.r-project.org/bin/linux/debian stretch-cran35/' >> /etc/apt/sources.list && \ apt-get update && \ apt-get install --no-install-recommends --yes r-base && \ apt-get purge --yes wget gnupg apt-transport-https && \ apt-get autoremove --yes && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* # Install python and pip deps RUN apt-get update && apt-get install --no-install-recommends --yes build-essential python3-dev libxml2 libxml2-dev zlib1g-dev && \ pip3 --no-cache-dir install --upgrade pip && \ pip3 --no-cache-dir install --upgrade setuptools && \ pip3 --no-cache-dir install pandas rpy2 scikit-learn statsmodels tensorflow scvi && \ apt-get autoremove --yes && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* COPY Scripts/run_scVI.py /Scripts/ ================================================ FILE: Snakemake/Dockerfiles/singlecellnet/Dockerfile ================================================ FROM debian:9.9-slim # Install newest R version RUN apt-get update && \ apt-get install --no-install-recommends --yes wget gnupg apt-transport-https && \ wget -qO - http://keys.gnupg.net/pks/lookup?op=get\&search=0xAD5F960A256A04AF | apt-key add - && \ echo 'deb http://cloud.r-project.org/bin/linux/debian stretch-cran35/' >> /etc/apt/sources.list && \ apt-get update && \ apt-get install --no-install-recommends --yes r-base && \ apt-get purge --yes wget gnupg apt-transport-https && \ apt-get autoremove --yes && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* COPY Scripts/run_singleCellNet.R \ Dockerfiles/singlecellnet/install_packages.R \ /Scripts/ # Install R packages RUN apt-get update && \ apt-get install --no-install-recommends --yes make gcc libc6-dev g++ libcurl4-openssl-dev zlib1g-dev libssl-dev r-base-dev libxml2-dev && \ Rscript --vanilla /Scripts/install_packages.R && \ apt-get purge --yes make gcc g++ zlib1g-dev libcurl4-openssl-dev libc6-dev libssl-dev r-base-dev libxml2-dev && \ apt-get autoremove --yes && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* ================================================ FILE: Snakemake/Dockerfiles/singlecellnet/install_packages.R ================================================ withCallingHandlers({ install.packages("devtools", repos="https://cloud.r-project.org/") install.packages("BiocManager", repos="https://cloud.r-project.org/") BiocManager::install("fgsea") devtools::install_github("thomasp85/patchwork", ref="fd7958bae3e7a1e30237c751952e412a0a1d1242") devtools::install_github("pcahan1/singleCellNet", ref="4279a68112743b783cc82628421dd703261ec117") }, warning = function(w) stop(w)) ================================================ FILE: Snakemake/Dockerfiles/singler/Dockerfile ================================================ FROM debian:9.9-slim # Install newest R version RUN apt-get update && \ apt-get install --no-install-recommends --yes wget gnupg apt-transport-https && \ wget -qO - http://keys.gnupg.net/pks/lookup?op=get\&search=0xAD5F960A256A04AF | apt-key add - && \ echo 'deb http://cloud.r-project.org/bin/linux/debian stretch-cran35/' >> /etc/apt/sources.list && \ apt-get update && \ apt-get install --no-install-recommends --yes r-base && \ apt-get purge --yes wget gnupg apt-transport-https && \ apt-get autoremove --yes && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* COPY Scripts/run_SingleR.R \ Dockerfiles/singler/install_packages.R \ /Scripts/ RUN apt-get update && \ apt-get install --no-install-recommends --yes make gcc g++ r-base-dev libcurl4-openssl-dev libssl-dev libxml2-dev libxml2 && \ Rscript --vanilla /Scripts/install_packages.R && \ apt-get purge --yes make gcc g++ r-base-dev libcurl4-openssl-dev libssl-dev libxml2-dev && \ apt-get autoremove --yes && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* ================================================ FILE: Snakemake/Dockerfiles/singler/install_packages.R ================================================ withCallingHandlers({ install.packages("devtools", repos="https://cloud.r-project.org/") install.packages("Seurat", repos="https://cloud.r-project.org/") devtools::install_github("dviraran/SingleR", ref="db4823b380ba2c3142c857c8c0695200dd1736f6") }, warning = function(w) stop(w)) ================================================ FILE: Snakemake/LICENSE ================================================ MIT License Copyright (c) 2019 tabdelaal Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: Snakemake/README.md ================================================ # scRNAseq_Benchmark Benchmarking classification tools for scRNA-seq data ## How to use [snakemake](https://snakemake.readthedocs.io/en/stable/index.html) and [singularity](https://www.sylabs.io/docs/) need to be available on your system. You will need to run this on a linux system, as singularity only supports linux. From the root of this repository: ``` snakemake \ --configfile \ --use-singularity ``` If your data or output directory is not located under the root of this repository, be sure to tell snakemake to mount the appropriate directories in singularity: ``` snakemake \ --configfile \ --use-singularity \ --singularity-args '--bind : --bind :' ``` #### The config file ```YML output_dir: datafile: labfile: column: number_of_features: genes: human: tools_to_run: # List of tools to run - - - <...> ``` ##### Tool specific inputs Some tools require specific inputs. Add the following to your config file when one of these tools: - Garnett_CV ```YML Garnett_CV: markers: ``` - Garnett_Pretrained ```YML Garnett_Pretrained: classifier: ``` ## Included tools/methods - kNN50 - kNN9 - LDA - LDA_rejection (LDA with rejection option) - NMC - RF - SVM - SVM (SVM with rejection option) - [singleCellNet](https://github.com/pcahan1/singleCellNet) - [CHETAH](https://github.com/jdekanter/CHETAH) - [scmap](https://github.com/hemberg-lab/scmap) - scmapcell - scmapcluster - [SingleR](https://github.com/dviraran/SingleR) - [scID](https://github.com/BatadaLab/scID) - [scVI](https://github.com/YosefLab/scVI) - [Cell_BLAST](https://github.com/gao-lab/Cell_BLAST) - [Garnett](https://cole-trapnell-lab.github.io/garnett/) - Garnett_CV (without pretrained classifier) - Garnett_Pretrained (with pretrained classifier) ## Adding new tools In order to add a tool to this benchmarking workflow, a rule for this tool needs to be added to the `Snakefile`. This rule should produce as output: - a table of predicted label (`/_pred.csv`). - a table of true labels (`/_true.csv`). - a tables of testing, prediction and/or total time: - `//_test_time.csv` - `//_training_time.csv` - `//_total_time.csv` The input to this rule should be: - a count table (specified as the `datafile` in the config). - a true labels file (specified as the `labfile` in the config). You will want to write a wrapper script for the tool you want to add to facilitate this. The `"{output_dir}/CV_folds.RData"` input may be used to provide your wrapper script with folds for cross_validation. It is recommended to make a docker image containing all dependencies for both the tool and any wrappers for the tool. This wrapper script should also make a selection of the features to be used. This selection should be based on ranking which can be accessed by providing `feature ranking` as input to the wrapper script. The number of features to be used should be configurable and settable through the 'number_of_features' field in the config. The following can be used as a template for new rules. Replace everything surrounded by (and including the) `<>` with appropriate values. ``` rule SVM: input: datafile = config["datafile"], labfile = config["labfile"], folds = "{output_dir}/CV_folds.RData", ranking = feature_ranking output: pred = "{output_dir}//_pred.csv", true = "{output_dir}//_true.csv", test_time = "{output_dir}//_test_time.csv", training_time = "{output_dir}//_training_time.csv" log: "{output_dir}//.log" params: n_features = config.get("number_of_features", 0) singularity: "docker://" shell: " " "{input.datafile} " "{input.labfile} " "{input.folds} " "{wildcards.output_dir}/ " "{input.ranking} " "{params.n_features} " "&> {log}" ``` ================================================ FILE: Snakemake/Scripts/run_ACTINN.py ================================================ import os import numpy as np import pandas as pd import time as tm import rpy2.robjects as robjects def run_ACTINN(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0): ''' run ACTINN Wrapper script to run ACTINN on a benchmark dataset with 5-fold cross validation, outputs lists of true and predicted cell labels as csv files, as well as computation time. Parameters ---------- DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes as row names and gene names as column names. LabelsPath : Cell population annotations file path (.csv). CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function. OutputDir : Output directory defining the path of the exported file. GeneOrderPath : Gene order file path (.csv) obtained from feature selection, defining the genes order for each cross validation fold, default is NULL. NumGenes : Number of genes used in case of feature selection (integer), default is 0. ''' # read the Rdata file robjects.r['load'](CV_RDataPath) nfolds = np.array(robjects.r['n_folds'], dtype = 'int') tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool') col = np.array(robjects.r['col_Index'], dtype = 'int') col = col - 1 test_ind = np.array(robjects.r['Test_Idx']) train_ind = np.array(robjects.r['Train_Idx']) # read the data data = pd.read_csv(DataPath,index_col=0,sep=',') labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col) labels = labels.iloc[tokeep] data = data.iloc[tokeep] # read the feature file if (NumGenes > 0): features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',') # folder with results os.chdir(OutputDir) tot=[] truelab = [] pred = [] for i in range(np.squeeze(nfolds)): test_ind_i = np.array(test_ind[i], dtype = 'int') - 1 train_ind_i = np.array(train_ind[i], dtype = 'int') - 1 train=data.iloc[train_ind_i] test=data.iloc[test_ind_i] y_train=labels.iloc[train_ind_i] y_test=labels.iloc[test_ind_i] if (NumGenes > 0): feat_to_use = features.iloc[0:NumGenes,i] train = train.iloc[:,feat_to_use] test = test.iloc[:,feat_to_use] train = train.transpose() test = test.transpose() train.to_csv("train.csv") test.to_csv("test.csv") y_train.to_csv("train_lab.csv", header = False, index = True, sep = '\t') y_test.to_csv("test_lab.csv", header = False, index = True, sep = '\t') tm.sleep(60) os.system("python /home/nfs/lcmmichielsen/classifiers/ACTINN/actinn_format.py -i train.csv -o train -f csv") os.system("python /home/nfs/lcmmichielsen/classifiers/ACTINN/actinn_format.py -i test.csv -o test -f csv") start = tm.time() os.system("python /home/nfs/lcmmichielsen/classifiers/ACTINN/actinn_predict.py -trs train.h5 -trl train_lab.csv -ts test.h5") tot.append(tm.time()-start) tm.sleep(60) truelab.extend(y_test.values) predlabels = pd.read_csv('predicted_label.txt',header=0,index_col=None, sep='\t', usecols = [1]) pred.extend(predlabels.values) truelab = pd.DataFrame(truelab) pred = pd.DataFrame(pred) tot_time = pd.DataFrame(tot) if (NumGenes == 0): truelab.to_csv("ACTINN_True_Labels.csv", index = False) pred.to_csv("ACTINN_Pred_Labels.csv", index = False) tot_time.to_csv("ACTINN_Total_Time.csv", index = False) else: truelab.to_csv("ACTINN_" + str(NumGenes) + "_True_Labels.csv", index = False) pred.to_csv("ACTINN_" + str(NumGenes) + "_Pred_Labels.csv", index = False) tot_time.to_csv("ACTINN_" + str(NumGenes) + "_Total_Time.csv", index = False) ================================================ FILE: Snakemake/Scripts/run_CHETAH.R ================================================ args <- commandArgs(TRUE) run_CHETAH<-function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){ " run CHETAH Wrapper script to run CHETAH on a benchmark dataset with 5-fold cross validation, outputs lists of true and predicted cell labels as csv files, as well as computation time. Parameters ---------- DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes as row names and gene names as column names. LabelsPath : Cell population annotations file path (.csv). CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function. OutputDir : Output directory defining the path of the exported file. GeneOrderPath : Gene order file path (.csv) obtained from feature selection, defining the genes order for each cross validation fold, default is NULL. NumGenes : Number of genes used in case of feature selection (integer), default is NULL. " Data <- read.csv(DataPath,row.names = 1) Labels <- as.matrix(read.csv(LabelsPath)) load(CV_RDataPath) Labels <- as.vector(Labels[,col_Index]) Data <- Data[Cells_to_Keep,] Labels <- Labels[Cells_to_Keep] if(!is.null(GeneOrderPath) & !is.null (NumGenes)){ GenesOrder = read.csv(GeneOrderPath) } ############################################################################# # CHETAH # ############################################################################# library(CHETAH) library(SingleCellExperiment) True_Labels_CHETAH <- list() Pred_Labels_CHETAH <- list() Total_Time_CHETAH <- list() Data = t(as.matrix(Data)) for (i in c(1:n_folds)){ if(!is.null(GeneOrderPath) & !is.null (NumGenes)){ sce <- SingleCellExperiment(assays = list(counts = Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Train_Idx[[i]]]), colData = data.frame(celltypes = Labels[Train_Idx[[i]]])) sce_test <- SingleCellExperiment(assays = list(counts = Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Test_Idx[[i]]]), colData = data.frame(celltypes = Labels[Test_Idx[[i]]])) start_time <- Sys.time() sce_test <- CHETAHclassifier(input = sce_test, ref_cells = sce, n_genes = NumGenes) end_time <- Sys.time() } else{ sce <- SingleCellExperiment(assays = list(counts = Data[,Train_Idx[[i]]]), colData = data.frame(celltypes = Labels[Train_Idx[[i]]])) sce_test <- SingleCellExperiment(assays = list(counts = Data[,Test_Idx[[i]]]), colData = data.frame(celltypes = Labels[Test_Idx[[i]]])) start_time <- Sys.time() sce_test <- CHETAHclassifier(input = sce_test, ref_cells = sce) end_time <- Sys.time() } Total_Time_CHETAH[i] <- as.numeric(difftime(end_time,start_time,units = 'secs')) True_Labels_CHETAH[i] <- list(Labels[Test_Idx[[i]]]) Pred_Labels_CHETAH[i] <- list(sce_test$celltype_CHETAH) } True_Labels_CHETAH <- as.vector(unlist(True_Labels_CHETAH)) Pred_Labels_CHETAH <- as.vector(unlist(Pred_Labels_CHETAH)) Total_Time_CHETAH <- as.vector(unlist(Total_Time_CHETAH)) write.csv(True_Labels_CHETAH,paste0(OutputDir,'/CHETAH_true.csv'),row.names = FALSE) write.csv(Pred_Labels_CHETAH,paste0(OutputDir,'/CHETAH_pred.csv'),row.names = FALSE) write.csv(Total_Time_CHETAH,paste0(OutputDir,'/CHETAH_total_time.csv'),row.names = FALSE) } if (args[6] == "0") { run_CHETAH(args[1], args[2], args[3], args[4]) } else { run_CHETAH(args[1], args[2], args[3], args[4], args[5], as.numeric(args[6])) } ================================================ FILE: Snakemake/Scripts/run_CaSTLe.R ================================================ run_CaSTLe<-function(DataPath,LabelsPath,CV_RDataPath, OutputDir, GeneOrderPath = NULL, NumGenes = NULL){ " run CaSTLe Wrapper script to run CaSTLe on a benchmark dataset with 5-fold cross validation, outputs lists of true and predicted cell labels as csv files, as well as computation time. Parameters ---------- DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes as row names and gene names as column names. LabelsPath : Cell population annotations file path (.csv). CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function. OutputDir : Output directory defining the path of the exported file. GeneOrderPath : Gene order file path (.csv) obtained from feature selection, defining the genes order for each cross validation fold, default is NULL. NumGenes : Number of genes used in case of feature selection (integer), default is NULL. " Data <- read.csv(DataPath,row.names = 1) Labels <- as.matrix(read.csv(LabelsPath)) load(CV_RDataPath) Labels <- as.vector(Labels[,col_Index]) Data <- Data[Cells_to_Keep,] Labels <- Labels[Cells_to_Keep] if(!is.null(GeneOrderPath) & !is.null (NumGenes)){ GenesOrder = read.csv(GeneOrderPath) } ############################################################################# # CaSTLe # ############################################################################# library(igraph) library(xgboost) True_Labels_Castle <- list() Pred_Labels_Castle <- list() Training_Time_Castle <- list() Testing_Time_Castle <- list() BREAKS=c(-1, 0, 1, 6, Inf) nFeatures = 100 for(i in c(1:n_folds)){ # 1. Load datasets if(!is.null(GeneOrderPath) & !is.null (NumGenes)){ ds1 = Data[Train_Idx[[i]],as.vector(GenesOrder[c(1:NumGenes),i])+1] ds2 = Data[Test_Idx[[i]],as.vector(GenesOrder[c(1:NumGenes),i])+1] } else{ ds1 = Data[Train_Idx[[i]],] ds2 = Data[Test_Idx[[i]],] } sourceCellTypes = as.factor(Labels[Train_Idx[[i]]]) targetCellTypes = as.factor(Labels[Test_Idx[[i]]]) start_time <- Sys.time() # 2. Unify sets, excluding low expressed genes source_n_cells_counts = apply(ds1, 2, function(x) { sum(x > 0) } ) target_n_cells_counts = apply(ds2, 2, function(x) { sum(x > 0) } ) common_genes = intersect( colnames(ds1)[source_n_cells_counts>10], colnames(ds2)[target_n_cells_counts>10]) remove(source_n_cells_counts, target_n_cells_counts) ds1 = ds1[, colnames(ds1) %in% common_genes] ds2 = ds2[, colnames(ds2) %in% common_genes] ds = rbind(ds1[,common_genes], ds2[,common_genes]) isSource = c(rep(TRUE,nrow(ds1)), rep(FALSE,nrow(ds2))) remove(ds1, ds2) # 3. Highest mean in both source and target topFeaturesAvg = colnames(ds)[order(apply(ds, 2, mean), decreasing = T)] end_time <- Sys.time() Training_Time_Castle[i] <- as.numeric(difftime(end_time,start_time,units = 'secs')) start_time <- Sys.time() # for each cell - what is the most probable classification? L = length(levels(sourceCellTypes)) targetClassification = as.data.frame(matrix(rep(0,L*sum(!isSource)), nrow=L), row.names = levels(sourceCellTypes)) for (cellType in levels(sourceCellTypes)) { inSourceCellType = as.factor(ifelse(sourceCellTypes == cellType, cellType, paste0("NOT",cellType))) # 4. Highest mutual information in source topFeaturesMi = names(sort(apply(ds[isSource,],2,function(x) { compare(cut(x,breaks=BREAKS),inSourceCellType,method = "nmi") }), decreasing = T)) # 5. Top n genes that appear in both mi and avg selectedFeatures = union(head(topFeaturesAvg, nFeatures) , head(topFeaturesMi, nFeatures) ) # 6. remove correlated features tmp = cor(ds[,selectedFeatures], method = "pearson") tmp[!lower.tri(tmp)] = 0 selectedFeatures = selectedFeatures[apply(tmp,2,function(x) any(x < 0.9))] remove(tmp) # 7,8. Convert data from continous to binned dummy vars # break datasets to bins dsBins = apply(ds[, selectedFeatures], 2, cut, breaks= BREAKS) # use only bins with more than one value nUniq = apply(dsBins, 2, function(x) { length(unique(x)) }) # convert to dummy vars ds0 = model.matrix(~ . , as.data.frame(dsBins[,nUniq>1])) remove(dsBins, nUniq) cat(paste0("

Classifier for ",cellType,"

")) inTypeSource = sourceCellTypes == cellType # 9. Classify xg=xgboost(data=ds0[isSource,] , label=inTypeSource, objective="binary:logistic", eta=0.7 , nthread=1, nround=20, verbose=0, gamma=0.001, max_depth=5, min_child_weight=10) # 10. Predict inTypeProb = predict(xg, ds0[!isSource, ]) targetClassification[cellType,] = inTypeProb } end_time <- Sys.time() Testing_Time_Castle[i] <- as.numeric(difftime(end_time,start_time,units = 'secs')) True_Labels_Castle[i] <- list(Labels[Test_Idx[[i]]]) Pred_Labels_Castle[i] <- list(rownames(targetClassification)[apply(targetClassification,2,which.max)]) } True_Labels_Castle <- as.vector(unlist(True_Labels_Castle)) Pred_Labels_Castle <- as.vector(unlist(Pred_Labels_Castle)) Training_Time_Castle <- as.vector(unlist(Training_Time_Castle)) Testing_Time_Castle <- as.vector(unlist(Testing_Time_Castle)) if(!is.null(GeneOrderPath) & !is.null (NumGenes)){ write.csv(True_Labels_Castle,paste('True_Labels_Castle_',NumGenes,'.csv', sep = ''),row.names = FALSE) write.csv(Pred_Labels_Castle,paste('Pred_Labels_Castle_',NumGenes,'.csv', sep = ''),row.names = FALSE) write.csv(Training_Time_Castle,paste('Training_Time_Castle_',NumGenes,'.csv', sep = ''),row.names = FALSE) write.csv(Testing_Time_Castle,paste('Testing_Time_Castle_',NumGenes,'.csv', sep = ''),row.names = FALSE) } else{ write.csv(True_Labels_Castle,'True_Labels_CaSTLe.csv',row.names = FALSE) write.csv(Pred_Labels_Castle,'Pred_Labels_CaSTLe.csv',row.names = FALSE) write.csv(Training_Time_Castle,'Training_Time_CaSTLe.csv',row.names = FALSE) write.csv(Testing_Time_Castle,'Testing_Time_CaSTLe.csv',row.names = FALSE) } } ================================================ FILE: Snakemake/Scripts/run_Cell_BLAST.py ================================================ import os from sys import argv from pathlib import Path import time as tm import pandas as pd import warnings warnings.filterwarnings("ignore") import tensorflow as tf tf.logging.set_verbosity(0) import Cell_BLAST as cb import numpy as np from numpy import genfromtxt as gft import rpy2.robjects as robjects def run_Cell_BLAST(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0): ''' run Cell_BLAST Wrapper script to run Cell_BLAST on a benchmark dataset with 5-fold cross validation, outputs lists of true and predicted cell labels as csv files, as well as computation time. Parameters ---------- DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes as row names and gene names as column names. LabelsPath : Cell population annotations file path (.csv). CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function. OutputDir : Output directory defining the path of the exported file. GeneOrderPath : Gene order file path (.csv) obtained from feature selection, defining the genes order for each cross validation fold, default is NULL. NumGenes : Number of genes used in case of feature selection (integer), default is 0. ''' # read the Rdata file robjects.r['load'](CV_RDataPath) nfolds = np.array(robjects.r['n_folds'], dtype = 'int') tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool') col = np.array(robjects.r['col_Index'], dtype = 'int') col = col - 1 test_ind = np.array(robjects.r['Test_Idx']) train_ind = np.array(robjects.r['Train_Idx']) # read the feature file if (NumGenes > 0): features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',') # read the data and labels data_old = cb.data.ExprDataSet.read_table(DataPath,orientation="cg", sep=",", index_col = 0, header = 0, sparsify = True).normalize() labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col) data = cb.data.ExprDataSet(data_old.exprs[tokeep],data_old.obs.iloc[tokeep],data_old.var,data_old.uns) labels = gft(LabelsPath, dtype = "str", skip_header = 1, delimiter = ",", usecols = col) labels = labels[tokeep] truelab = [] pred = [] tr_time = [] ts_time = [] for i in range(np.squeeze(nfolds)): test_ind_i = np.array(test_ind[i], dtype = 'int') - 1 train_ind_i = np.array(train_ind[i], dtype = 'int') - 1 train=data[train_ind_i,:] test=data[test_ind_i,:] y_train = labels[train_ind_i] y_test = labels[test_ind_i] if (NumGenes > 0): feat_to_use = features.iloc[0:NumGenes,i] train = train[:,feat_to_use] test = test[:,feat_to_use] train.obs['cell_type'] = y_train start = tm.time() # reduce dimensions num_epoch = 50 models = [] for j in range(4): models.append(cb.directi.fit_DIRECTi(train, epoch=num_epoch, patience=10, random_seed = j, path="%d" % j)) # train model blast = cb.blast.BLAST(models, train).build_empirical() tr_time.append(tm.time()-start) # predict labels start = tm.time() test_pred = blast.query(test).annotate('cell_type') ts_time.append(tm.time()-start) truelab.extend(y_test) pred.extend(test_pred.values) #write results truelab = pd.DataFrame(truelab) pred = pd.DataFrame(pred) tr_time = pd.DataFrame(tr_time) ts_time = pd.DataFrame(ts_time) truelab.to_csv(str(Path(OutputDir+"/Cell_BLAST_true.csv")),index = False) pred.to_csv(str(Path(OutputDir+"/Cell_BLAST_pred.csv")),index = False) tr_time.to_csv(str(Path(OutputDir+"/Cell_BLAST_training_time.csv")), index = False) ts_time.to_csv(str(Path(OutputDir+"/Cell_BLAST_test_time.csv")),index = False) run_Cell_BLAST(argv[1], argv[2], argv[3], argv[4], argv[5], int(argv[6])) ================================================ FILE: Snakemake/Scripts/run_DigitalCellSorter.py ================================================ import numpy as np import pandas as pd import scripts.DigitalCellSorter as DigitalCellSorter import os import time as tm import rpy2.robjects as robjects def run_DigitalCellSorter(DataPath, LabelsPath, CV_RDataPath, GeneListPath, OutputDir, GeneOrderPath = "", NumGenes = 0): ''' run DigitalCellSorter Wrapper script to run DigitalCellSorter on a benchmark dataset using a predefined genelist, outputs lists of true and predicted cell labels as csv files, as well as computation time. Parameters ---------- DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes as row names and gene names as column names. LabelsPath : Cell population annotations file path (.csv). CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function. GeneListPath : Data file path to the genest. OutputDir : Output directory defining the path of the exported file. GeneOrderPath : Gene order file path (.csv) obtained from feature selection, defining the genes order for each cross validation fold, default is NULL. NumGenes : Number of genes used in case of feature selection (integer), default is 0. ''' # read the Rdata file robjects.r['load'](CV_RDataPath) tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool') col = np.array(robjects.r['col_Index'], dtype = 'int') col = col - 1 # read the data data = pd.read_csv(DataPath,index_col=0,sep=',') data = data.iloc[tokeep] truelab = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col) truelab = truelab.iloc[tokeep] # read the feature file if (NumGenes > 0): features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',') feat_to_use = features.iloc[0:NumGenes,0] data = data.iloc[:,feat_to_use] data = data.transpose() # number of different cell types in the data? n_clusters = 8 AvailableCPUsCount = 1 N_samples_for_distribution = 10000 start = tm.time() pred = DigitalCellSorter.DigitalCellSorter().Process(data, 'DigitalCellSorter_Zhang', saveDir = OutputDir, geneListFileName = GeneListPath, N_samples_for_distribution = N_samples_for_distribution, AvailableCPUsCount = AvailableCPUsCount, clusterIndex=None, clusterName=None, n_clusters=n_clusters) runtime = tm.time() - start os.chdir(OutputDir) results = pd.read_excel('DigitalCellSorter_Zhang_voting.xlsx',header=0,index_col=None, usecols=[11]) prediction = np.zeros(np.shape(pred), dtype='>U10') for i in range(len(results)): prediction[np.where(pred == i)] = results.values[i] prediction = pd.DataFrame(prediction) if (NumGenes == 0): truelab.to_csv("DigitalCellSorter_True_Labels.csv", index = False) prediction.to_csv("DigitalCellSorter_Pred_Labels.csv", index = False) with open("DigitalCellSorter_Total_Time.csv", 'w') as f: f.write("%f\n" % runtime) else: truelab.to_csv("DigitalCellSorter_" + str(NumGenes) + "_True_Labels.csv", index = False) prediction.to_csv("DigitalCellSorter_" + str(NumGenes) + "_Pred_Labels.csv", index = False) with open("DigitalCellSorter_" + str(NumGenes) + "_Total_Time.csv", 'w') as f: f.write("%f\n" % runtime) ================================================ FILE: Snakemake/Scripts/run_Garnett_CV.R ================================================ args <- commandArgs(TRUE) run_Garnett_CV <- function(DataPath, LabelsPath, CV_RDataPath, GenesPath, MarkerPath, OutputDir, Human){ " run Garnett Wrapper script to run Garnett on a benchmark dataset with 5-fold cross validation, outputs lists of true and predicted cell labels as csv files, as well as computation time. Parameters ---------- DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes as row names and gene names as column names. LabelsPath : Cell population annotations file path (.csv). CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function. GenesPath : Path to the file with the genenames MarkerPath : Path to the file with marker genes OutputDir : Output directory defining the path of the exported file. Human : boolean indicating whether the dataset is human (TRUE) or mouse (FALSE) " # load needed libraries library(garnett) if (Human) { library(org.Hs.eg.db) } else { library(org.Mm.eg.db) } # load the CVFile load(CV_RDataPath) # read the labels labels <- as.matrix(read.csv(LabelsPath)) labels <- as.vector(labels[,col_Index]) labels <- labels[Cells_to_Keep] # read the data mat <- read.table(DataPath, sep = ",") data <- mat[-1,-1] data <- data[Cells_to_Keep,] data <- t(data) #ensure that the genes are rows, and the cells are columns cells <- mat[-1,1] cells <- cells[Cells_to_Keep] # read the genefile fdata <- read.table(GenesPath) names(fdata) <- 'gene_short_name' row.names(fdata) <- fdata$gene_short_name fd <- new("AnnotatedDataFrame", data = fdata) true_labels <- list() pred_labels <- list() train_time <- list() test_time <- list() for (i in c(1:n_folds)){ lab_train = labels[Train_Idx[[i]]] lab_test = labels[Test_Idx[[i]]] train = data[,Train_Idx[[i]]] test = data[,Test_Idx[[i]]] cells_train = cells[Train_Idx[[i]]] cells_test = cells[Test_Idx[[i]]] pdata_train = data.frame(cells_train) pdata_test = data.frame(cells_test) row.names(train) <- row.names(fdata) row.names(test) <- row.names(fdata) colnames(train) <- row.names(pdata_train) colnames(test) <- row.names(pdata_test) pd_train <- new("AnnotatedDataFrame", data = pdata_train) pd_test <- new("AnnotatedDataFrame", data = pdata_test) pbmc_cds_train <- newCellDataSet(as(train, "dgCMatrix"), phenoData = pd_train, featureData = fd) pbmc_cds_test <- newCellDataSet(as(test, "dgCMatrix"), phenoData = pd_test, featureData = fd) pbmc_cds_train <- estimateSizeFactors(pbmc_cds_train) pbmc_cds_test <- estimateSizeFactors(pbmc_cds_test) # training start_train <- Sys.time() if (Human){ pbmc_classifier <- train_cell_classifier(cds = pbmc_cds_train, marker_file = MarkerPath, db=org.Hs.eg.db, cds_gene_id_type = "SYMBOL", num_unknown = 50, marker_file_gene_id_type = "SYMBOL") } else { pbmc_classifier <- train_cell_classifier(cds = pbmc_cds_train, marker_file = MarkerPath, db=org.Mm.eg.db, cds_gene_id_type = "SYMBOL", num_unknown = 50, marker_file_gene_id_type = "SYMBOL") } end_train <- Sys.time() train_time[i] <- as.numeric(end_train - start_train) # testing start_test <- Sys.time() if (Human) { pbmc_cds_test <- classify_cells(pbmc_cds_test, pbmc_classifier, db = org.Hs.eg.db, cluster_extend = TRUE, cds_gene_id_type = "SYMBOL") } else { pbmc_cds_test <- classify_cells(pbmc_cds_test, pbmc_classifier, db = org.Mm.eg.db, cluster_extend = TRUE, cds_gene_id_type = "SYMBOL") } end_test <- Sys.time() test_time[i] <- as.numeric(end_test - start_test) true_labels[i] <- list(lab_test) pred_labels[i] <- list(pData(pbmc_cds_test)$cluster_ext_type) } true_labels <- as.vector(unlist(true_labels)) pred_labels <- as.vector(unlist(pred_labels)) train_time <- as.vector(unlist(train_time)) test_time <- as.vector(unlist(test_time)) write.csv(true_labels,paste0(OutputDir,'/Garnett_CV_true.csv'),row.names = FALSE) write.csv(pred_labels,paste0(OutputDir,'/Garnett_CV_pred.csv'),row.names = FALSE) write.csv(train_time,paste0(OutputDir,'/Garnett_CV_training_time.csv'),row.names = FALSE) write.csv(test_time,paste0(OutputDir,'/Garnett_CV_test_time.csv'),row.names = FALSE) } run_Garnett_CV(args[1], args[2], args[3], args[4], args[5], args[6], args[7]) ================================================ FILE: Snakemake/Scripts/run_Garnett_Pretrained.R ================================================ args <- commandArgs(TRUE) run_Garnett_Pretrained <- function(DataPath, LabelsPath, GenesPath, CV_RDataPath, ClassifierPath, OutputDir, Human){ " run Garnett Wrapper script to run Garnett on a benchmark dataset with a pretrained classifier, outputs lists of true and predicted cell labels as csv files, as well as computation time. Parameters ---------- DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes as row names and gene names as column names. LabelsPath : Cell population annotations file path (.csv). CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function. GenesPath : Path to the file with the genenames ClassifierPath : Path to the pretrained classifier OutputDir : Output directory defining the path of the exported file. Human : boolean indicating whether the dataset is human (TRUE) or mouse (FALSE) " # load needed libraries library(garnett) if (Human) { library(org.Hs.eg.db) } else { library(org.Mm.eg.db) } # load data, genes, and marker file load(CV_RDataPath) load(ClassifierPath) labels <- as.matrix(read.csv(LabelsPath)) labels <- labels[Cells_to_Keep] mat <- read.table(DataPath, sep = ",") data <- mat[-1,-1] data <- data[Cells_to_Keep,] data <- t(data) #ensure that the genes are rows, and the cells are columns barcodes <- mat[-1,1] pdata = data.frame(barcodes) fdata <- read.table(GenesPath) names(fdata) <- 'gene_short_name' row.names(fdata) <- fdata$gene_short_name row.names(data) <- row.names(fdata) colnames(data) <- row.names(pdata) pd <- new("AnnotatedDataFrame", data = pdata) fd <- new("AnnotatedDataFrame", data = fdata) pbmc_cds <- newCellDataSet(as(data, "dgCMatrix"), phenoData = pd, featureData = fd) start_time <- Sys.time() pbmc_cds <- estimateSizeFactors(pbmc_cds) if (Human){ pbmc_cds <- classify_cells(pbmc_cds, hsPBMC, db = org.Hs.eg.db, cluster_extend = TRUE, cds_gene_id_type = "SYMBOL") } else { pbmc_cds <- classify_cells(pbmc_cds, mmLung, db = org.Mm.eg.db, cluster_extend = TRUE, cds_gene_id_type = "SYMBOL") } end_time <- Sys.time() test_time <- as.numeric(end_time - start_time) write.table(pData(pbmc_cds)$cluster_ext_type, file = paste0(OutputDir, "/Garnett_Pretrained_pred.csv"), append = FALSE, quote = TRUE, sep = "\t", eol = "\n", na = "NA", dec = ".", row.names = FALSE, qmethod = c("escape", "double"), fileEncoding = "") write.csv(labels,paste0(OutputDir,"/Garnett_Pretrained_true.csv"), row.names = FALSE) write.csv(test_time,paste0(OutputDir,'/Garnett_Pretrained_test_time.csv'),row.names = FALSE) } run_Garnett_Pretrained(args[1], args[2], args[3], args[4], args[5], args[6], args[7]) ================================================ FILE: Snakemake/Scripts/run_LAmbDA.py ================================================ # -*- coding: utf-8 -*- """ Created on Thu May 23 13:51:15 2019 @author: Lieke """ import os import numpy as np import pandas as pd import time as tm import rpy2.robjects as robjects import tensorflow as tf import math import scipy.io as sio import optunity as opt from tensorflow.contrib.tensor_forest.python import tensor_forest from tensorflow.python.ops import resources def run_LAmbDA(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0): ''' run LAmbDA classifier Wrapper script to run LAmbDA on a benchmark dataset with 5-fold cross validation, outputs lists of true and predicted cell labels as csv files, as well as computation time. Parameters ---------- DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes as row names and gene names as column names. LabelsPath : Cell population annotations file path (.csv). CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function. OutputDir : Output directory defining the path of the exported file. GeneOrderPath : Gene order file path (.csv) obtained from feature selection, defining the genes order for each cross validation fold, default is NULL. NumGenes : Number of genes used in case of feature selection (integer), default is 0. ''' # read the Rdata file robjects.r['load'](CV_RDataPath) nfolds = np.array(robjects.r['n_folds'], dtype = 'int') tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool') col = np.array(robjects.r['col_Index'], dtype = 'int') col = col - 1 test_ind = np.array(robjects.r['Test_Idx']) train_ind = np.array(robjects.r['Train_Idx']) # read the data data = pd.read_csv(DataPath,index_col=0,sep=',') labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col) labels = labels.iloc[tokeep] data = data.iloc[tokeep] # read the feature file if (NumGenes > 0): features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',') # folder with results os.chdir(OutputDir) tr_time=[] ts_time=[] truelab = np.zeros([len(labels),1],dtype = int) predlab = np.zeros([len(labels),1],dtype = int) for i in range(np.squeeze(nfolds)): global X, Y, Gnp, Dnp, train, test, prt, cv test_ind_i = np.array(test_ind[i], dtype = 'int') - 1 train_ind_i = np.array(train_ind[i], dtype = 'int') - 1 X = np.array(data) if (NumGenes > 0): X = np.log2(X/10+1) feat_to_use = features.iloc[0:NumGenes,i] X = X[:,feat_to_use] else: X = np.log2(np.transpose(select_feats(np.transpose(X),0.5,80))/10+1) uniq = np.unique(labels) Y = np.zeros([len(labels),len(uniq)],int) for j in range(len(uniq)): Y[np.where(labels == uniq[j])[0],j] = 1 Y = np.array(Y) Gnp = np.zeros([len(uniq),len(uniq)],int) np.fill_diagonal(Gnp,1) Gnp = np.array(Gnp) Dnp = np.ones([len(uniq),1],int) Dnp = np.array(Dnp) train_samp = int(np.floor(0.75*len(train_ind_i))) test_samp = len(train_ind_i) - train_samp perm = np.random.permutation(len(train_ind_i)) train = perm[0:train_samp] test = perm[train_samp:test_samp+1] while(np.sum(np.sum(Y[train,:],0)<5)>0): perm = np.random.permutation(X.shape[0]) train = perm[0:train_samp+1] test = perm[train_samp+1:train_samp+test_samp+1] cv = i optunity_it = 0 prt = False opt_params = None start=tm.time() opt_params, _, _ = opt.minimize(run_LAmbDA2,solver_name='sobol', gamma=[0.8,1.2], delta=[0.05,0.95], tau=[10.0,11.0], prc_cut=[20,50], bs_prc=[0.2,0.6], num_trees=[10,200], max_nodes=[100,1000], num_evals=50) tr_time.append(tm.time()-start) print("Finished training!") prt = True train = train_ind_i test = test_ind_i start=tm.time() err = run_LAmbDA2(opt_params['gamma'], opt_params['delta'], opt_params['tau'], opt_params['prc_cut'], opt_params['bs_prc'], opt_params['num_trees'], opt_params['max_nodes']) ts_time.append(tm.time()-start) tf.reset_default_graph(); predfile = 'preds_cv' + str(cv) + '.mat' truefile = 'truth_cv' + str(cv) + '.mat' pred = sio.loadmat(predfile) truth = sio.loadmat(truefile) pred = pred['preds'] truth = truth['labels'] pred_ind = np.argmax(pred,axis=1) truth_ind = np.argmax(truth,axis=1) predlab[test_ind_i,0] = pred_ind truelab[test_ind_i,0] = truth_ind truelab = pd.DataFrame(truelab) predlab = pd.DataFrame(predlab) tr_time = pd.DataFrame(tr_time) ts_time = pd.DataFrame(ts_time) if (NumGenes == 0): truelab.to_csv("LAmbDA_True_Labels.csv", index = False) predlab.to_csv("LAmbDA_Pred_Labels.csv", index = False) tr_time.to_csv("LAmbDA_Training_Time.csv", index = False) ts_time.to_csv("LAmbDA_Testing_Time.csv", index = False) else: truelab.to_csv("LAmbDA_" + str(NumGenes) + "_True_Labels.csv", index = False) predlab.to_csv("LAmbDA_" + str(NumGenes) + "_Pred_Labels.csv", index = False) tr_time.to_csv("LAmbDA_" + str(NumGenes) + "_Training_Time.csv", index = False) ts_time.to_csv("LAmbDA_" + str(NumGenes) + "_Testing_Time.csv", index = False) ##### Functions copied from LAmbDA's Github def wt_cutoff(colnum,cutoff,Gtmp,gamma): rowsums = np.sum(Gtmp,axis=1); return(math.ceil(cutoff*(math.log((max(rowsums)/rowsums[colnum])+1,2)**gamma))) def resample(prc_cut,Y,Gtmp,train,gamma): add = list() rem = list() colsums = np.sum(Y[train,:],axis=0); cutoff = math.ceil(np.percentile(colsums,prc_cut)); for i in range(len(colsums)): if colsums[i] == 0: pass elif colsums[i] < wt_cutoff(i,cutoff,Gtmp,gamma): idx = np.squeeze(np.array(np.where(Y[train,i]>=1))); choice = np.random.choice(train[idx],int(wt_cutoff(i,cutoff,Gtmp,gamma)-colsums[i])) add = add + choice.tolist(); elif colsums[i] == wt_cutoff(i,cutoff,Gtmp,gamma): pass else: idx = np.squeeze(np.array(np.where(Y[train,i]>=1))); choice = np.random.choice(train[idx],int(colsums[i]-wt_cutoff(i,cutoff,Gtmp,gamma)),replace=False) rem = rem + choice.tolist() return np.concatenate((list([val for val in train if val not in rem]),add)); def select_feats(Xtmp,num_zero_prc_cut,var_prc_cut): #********************************************************************* # remove features with many zeros num_feat_zeros = np.sum(Xtmp==0,axis=1); Xtmp = Xtmp[num_feat_zerosnp.percentile(feat_vars,var_prc_cut),:] return(Xtmp) def get_yn(predict,ys,delta,tau,output_feats): D = tf.cast(Dnp, tf.float32); G = tf.cast(Gnp, tf.float32); ys = tf.cast(ys, tf.float32); #print("start") Cm = tf.matmul(tf.transpose(tf.matmul(ys,D)),predict+0.1)/tf.reshape(tf.reduce_sum(tf.transpose(tf.matmul(ys,D)),1),(-1,1)); #print("1") mCm = tf.reshape(tf.reduce_mean(tf.cast(tf.matmul(tf.transpose(D),G)>0,tf.float32)*Cm,1),(-1,1)); #print("2") yw = tf.multiply(predict+0.1,tf.matmul(tf.matmul(ys,D),tf.pow(mCm/Cm,tau))); #print("3") ye = tf.multiply(tf.matmul(ys,G),yw); #print("4") yt = tf.matmul(ys,tf.matmul(tf.transpose(ys),ye)); #print("5") ya = (delta*yt)+((1-delta)*ye) #print("6") yn = tf.cast(tf.one_hot(tf.argmax(ya,axis=1),output_feats), dtype=tf.float32) #print("7") return(yn) def get_yi(rowsums,G2,ys): G2 = tf.cast(G2, tf.float32); ys = tf.cast(ys, tf.float32); yi = tf.cast(tf.matmul(ys,G2), dtype=tf.float32); return(yi) def run_LAmbDA2(gamma, delta, tau, prc_cut, bs_prc, num_trees, max_nodes): global X, Y, Gnp, Dnp, train, test, prt, cv D = tf.cast(Dnp, tf.float32); G = tf.cast(Gnp, tf.float32); #optunity_it = optunity_it+1; num_trees = int(num_trees); max_nodes = int(max_nodes); prc_cut = int(np.ceil(prc_cut)); print("gamma=%.4f, delta=%.4f, tau=%.4f, prc_cut=%i, bs_prc=%.4f, num_trees=%i, max_nodes=%i" % (gamma, delta, tau, prc_cut, bs_prc, num_trees, max_nodes)) input_feats = X.shape[1]; num_labls = G.shape.as_list(); output_feats = num_labls[1]; #print(output_feats) num_labls = num_labls[0]; rowsums = np.sum(Gnp,axis=1); train2 = resample(prc_cut, Y, Gnp, train, gamma); # Bug?? bs = int(np.ceil(bs_prc*train2.size)) xs = tf.placeholder(tf.float32, [None,input_feats]) #ys = tf.placeholder(tf.float32, [None,num_labls]) yin = tf.placeholder(tf.int32, [None]) print("Vars loaded xs and ys created") hparams = tensor_forest.ForestHParams(num_classes=output_feats, num_features=input_feats, num_trees=num_trees, max_nodes=max_nodes).fill() print("Tensor forest hparams created") forest_graph = tensor_forest.RandomForestGraphs(hparams) print("Tensor forest graph created") train_op = forest_graph.training_graph(xs, yin) loss_op = forest_graph.training_loss(xs, yin) print("Loss and train ops created") predict, _, _ = forest_graph.inference_graph(xs) print("Tensor forest variables created through predict") accuracy_op = tf.reduce_mean(tf.reduce_sum(tf.square(tf.one_hot(yin,output_feats)-predict),reduction_indices=[1])) print(tf.reduce_sum(tf.square(tf.one_hot(yin,output_feats)-predict),reduction_indices=[1])) #predict = tf.one_hot(pred); print("Lambda specific variables created") # Creating training and testing steps G2 = np.copy(Gnp); G2[rowsums>1,:] = 0; YI = np.matmul(Y,G2); YIrs = np.sum(YI,axis=1); trainI = train2[np.in1d(train2,np.where(YIrs==1))]; print("data type trainI,",trainI.dtype) testI = test[np.in1d(test,np.where(YIrs==1))]; print("trainI testI created") #init_vars=tf.global_variables_initializer() init_vars = tf.group(tf.global_variables_initializer(), resources.initialize_resources(resources.shared_resources())) sess = tf.Session() sess.run(init_vars) print("Session started") #beep = sess.run(predict,feed_dict={xs:X[1:100,:]}); #beep = sess.run(predict,feed_dict={xs:X[train2[0:bs],:]}); tensor_trainI = {xs: X[trainI, :], yin: sess.run(tf.argmax(get_yi(rowsums,G2,Y[trainI, :]),axis=1))} print("tensor_trainI made") tensor_testI = {xs: X[testI, :], yin: sess.run(tf.argmax(get_yi(rowsums,G2,Y[testI, :]),axis=1))} print("tensor_testI made") tensor_train = {xs: X[train2[0:bs], :], yin: sess.run(tf.argmax(get_yn(sess.run(predict,feed_dict={xs:X[train2[0:bs],:]}),Y[train2[0:bs], :],delta,tau,output_feats),axis=1))} print("tensor_train made") tensor_test = {xs: X[test, :], yin: sess.run(tf.argmax(get_yn(sess.run(predict,feed_dict={xs:X[test,:]}),Y[test, :],delta,tau,output_feats),axis=1))} print("tensor_test made") #********************************** #print("Loss and training steps created with sample tensors") # Setting params and initializing print("Beginning iterations") # Starting training iterations print(X.shape) for i in range(1,101): if i < 50: sess.run(train_op, feed_dict=tensor_trainI) #print("ran train op") if i % 10 == 0: print(str(sess.run(accuracy_op, feed_dict=tensor_trainI)) + ' ' + str(sess.run(accuracy_op, feed_dict=tensor_testI))) else: sess.run(train_op, feed_dict=tensor_train) if i % 10 == 0: print(str(sess.run(accuracy_op, feed_dict=tensor_train)) + ' ' + str(sess.run(accuracy_op, feed_dict=tensor_test))) elif i % 10 == 0: np.random_shuffle(train2); tensor_train = {xs: X[train2[0:bs], :], yin: sess.run(get_yn(sess.run(predict,feed_dict={xs:X[train2[0:bs],:]}),Y[train2[0:bs], :],delta,tau,output_feats))} if prt: blah = sess.run(predict, feed_dict=tensor_test); sio.savemat('preds_cv' + str(cv) + '.mat', {'preds': blah}); sio.savemat('truth_cv' + str(cv) + '.mat', {'labels': Y[test, :]}); acc = sess.run(accuracy_op, feed_dict=tensor_test) print("loss1=%.4f, gamma=%.4f, delta=%.4f, tau=%.4f, prc_cut=%i, bs_prc=%.4f, num_trees=%i, max_nodes=%i" % (acc, gamma, delta, tau, prc_cut, bs_prc, num_trees, max_nodes)) tf.reset_default_graph(); return(acc) ================================================ FILE: Snakemake/Scripts/run_LDA.py ================================================ import os from sys import argv from pathlib import Path import numpy as np import pandas as pd import time as tm from sklearn.discriminant_analysis import LinearDiscriminantAnalysis import rpy2.robjects as robjects def run_LDA(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0): ''' run baseline classifier: LDA Wrapper script to run LDA classifier on a benchmark dataset with 5-fold cross validation, outputs lists of true and predicted cell labels as csv files, as well as computation time. Parameters ---------- DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes as row names and gene names as column names. LabelsPath : Cell population annotations file path (.csv). CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function. OutputDir : Output directory defining the path of the exported file. GeneOrderPath : Gene order file path (.csv) obtained from feature selection, defining the genes order for each cross validation fold, default is NULL. NumGenes : Number of genes used in case of feature selection (integer), default is 0. ''' # read the Rdata file robjects.r['load'](CV_RDataPath) nfolds = np.array(robjects.r['n_folds'], dtype = 'int') tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool') col = np.array(robjects.r['col_Index'], dtype = 'int') col = col - 1 test_ind = np.array(robjects.r['Test_Idx']) train_ind = np.array(robjects.r['Train_Idx']) # read the data data = pd.read_csv(DataPath,index_col=0,sep=',') labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col) labels = labels.iloc[tokeep] data = data.iloc[tokeep] # read the feature file if (NumGenes > 0): features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',') # normalize data data = np.log1p(data) Classifier = LinearDiscriminantAnalysis() tr_time=[] ts_time=[] truelab = [] pred = [] for i in range(np.squeeze(nfolds)): test_ind_i = np.array(test_ind[i], dtype = 'int') - 1 train_ind_i = np.array(train_ind[i], dtype = 'int') - 1 train=data.iloc[train_ind_i] test=data.iloc[test_ind_i] y_train=labels.iloc[train_ind_i] y_test=labels.iloc[test_ind_i] if (NumGenes > 0): feat_to_use = features.iloc[0:NumGenes,i] train = train.iloc[:,feat_to_use] test = test.iloc[:,feat_to_use] start=tm.time() Classifier.fit(train, y_train) tr_time.append(tm.time()-start) start=tm.time() predicted = Classifier.predict(test) ts_time.append(tm.time()-start) truelab.extend(y_test.values) pred.extend(predicted) truelab = pd.DataFrame(truelab) pred = pd.DataFrame(pred) tr_time = pd.DataFrame(tr_time) ts_time = pd.DataFrame(ts_time) OutputDir = Path(OutputDir) truelab.to_csv(str(OutputDir / Path("LDA_true.csv")), index = False) pred.to_csv(str(OutputDir / Path("LDA_pred.csv")), index = False) tr_time.to_csv(str(OutputDir / Path("LDA_training_time.csv")), index = False) ts_time.to_csv(str(OutputDir / Path("LDA_test_time.csv")), index = False) run_LDA(argv[1], argv[2], argv[3], argv[4], argv[5], int(argv[6])) ================================================ FILE: Snakemake/Scripts/run_LDA_rejection.py ================================================ import os from sys import argv from pathlib import Path import numpy as np import pandas as pd import time as tm from sklearn.discriminant_analysis import LinearDiscriminantAnalysis import rpy2.robjects as robjects def run_LDA_rejection(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0, Threshold = 0.7): ''' run baseline classifier: LDA Wrapper script to run LDA classifier on a benchmark dataset with 5-fold cross validation, outputs lists of true and predicted cell labels as csv files, as well as computation time. Parameters ---------- DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes as row names and gene names as column names. LabelsPath : Cell population annotations file path (.csv). CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function. OutputDir : Output directory defining the path of the exported file. GeneOrderPath : Gene order file path (.csv) obtained from feature selection, defining the genes order for each cross validation fold, default is NULL. NumGenes : Number of genes used in case of feature selection (integer), default is 0. Threshold : Threshold used when rejecting the genes, default is 0.7. ''' # read the Rdata file robjects.r['load'](CV_RDataPath) nfolds = np.array(robjects.r['n_folds'], dtype = 'int') tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool') col = np.array(robjects.r['col_Index'], dtype = 'int') col = col - 1 test_ind = np.array(robjects.r['Test_Idx']) train_ind = np.array(robjects.r['Train_Idx']) # read the data data = pd.read_csv(DataPath,index_col=0,sep=',') labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col) labels = labels.iloc[tokeep] data = data.iloc[tokeep] # read the feature file if (NumGenes > 0): features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',') # normalize data data = np.log1p(data) Classifier = LinearDiscriminantAnalysis() tr_time=[] ts_time=[] truelab = [] pred = [] for i in range(np.squeeze(nfolds)): test_ind_i = np.array(test_ind[i], dtype = 'int') - 1 train_ind_i = np.array(train_ind[i], dtype = 'int') - 1 train=data.iloc[train_ind_i] test=data.iloc[test_ind_i] y_train=labels.iloc[train_ind_i] y_test=labels.iloc[test_ind_i] if (NumGenes > 0): feat_to_use = features.iloc[0:NumGenes,i] train = train.iloc[:,feat_to_use] test = test.iloc[:,feat_to_use] start=tm.time() Classifier.fit(train, y_train) tr_time.append(tm.time()-start) start=tm.time() predicted = Classifier.predict(test) prob = np.max(Classifier.predict_proba(test), axis = 1) unlabeled = np.where(prob < Threshold) predicted[unlabeled] = 'Unknown' ts_time.append(tm.time()-start) truelab.extend(y_test.values) pred.extend(predicted) truelab = pd.DataFrame(truelab) pred = pd.DataFrame(pred) tr_time = pd.DataFrame(tr_time) ts_time = pd.DataFrame(ts_time) OutputDir = Path(OutputDir) truelab.to_csv(str(OutputDir / Path("LDA_rejection_true.csv")), index = False) pred.to_csv(str(OutputDir / Path("LDA_rejection_pred.csv")), index = False) tr_time.to_csv(str(OutputDir / Path("LDA_rejection_training_time.csv")), index = False) ts_time.to_csv(str(OutputDir / Path("LDA_rejection_test_time.csv")), index = False) run_LDA(argv[1], argv[2], argv[3], argv[4], argv[5], int(argv[6])) ================================================ FILE: Snakemake/Scripts/run_NMC.py ================================================ import os from sys import argv from pathlib import Path import numpy as np import pandas as pd import time as tm from sklearn.neighbors import NearestCentroid import rpy2.robjects as robjects def run_NMC(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0): ''' run baseline classifier: NMC Wrapper script to run a NMC classifier on a benchmark dataset with 5-fold cross validation, outputs lists of true and predicted cell labels as csv files, as well as computation time. Parameters ---------- DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes as row names and gene names as column names. LabelsPath : Cell population annotations file path (.csv). CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function. OutputDir : Output directory defining the path of the exported file. GeneOrderPath : Gene order file path (.csv) obtained from feature selection, defining the genes order for each cross validation fold, default is NULL. NumGenes : Number of genes used in case of feature selection (integer), default is 0. ''' # read the Rdata file robjects.r['load'](CV_RDataPath) nfolds = np.array(robjects.r['n_folds'], dtype = 'int') tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool') col = np.array(robjects.r['col_Index'], dtype = 'int') col = col - 1 test_ind = np.array(robjects.r['Test_Idx']) train_ind = np.array(robjects.r['Train_Idx']) # read the data data = pd.read_csv(DataPath,index_col=0,sep=',') labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col) labels = labels.iloc[tokeep] data = data.iloc[tokeep] # read the feature file if (NumGenes > 0): features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',') # normalize data data = np.log1p(data) Classifier = NearestCentroid() tr_time=[] ts_time=[] truelab = [] pred = [] for i in range(np.squeeze(nfolds)): test_ind_i = np.array(test_ind[i], dtype = 'int') - 1 train_ind_i = np.array(train_ind[i], dtype = 'int') - 1 train=data.iloc[train_ind_i] test=data.iloc[test_ind_i] y_train=labels.iloc[train_ind_i] y_test=labels.iloc[test_ind_i] if (NumGenes > 0): feat_to_use = features.iloc[0:NumGenes,i] train = train.iloc[:,feat_to_use] test = test.iloc[:,feat_to_use] start=tm.time() Classifier.fit(train, y_train) tr_time.append(tm.time()-start) start=tm.time() predicted = Classifier.predict(test) ts_time.append(tm.time()-start) truelab.extend(y_test.values) pred.extend(predicted) truelab = pd.DataFrame(truelab) pred = pd.DataFrame(pred) tr_time = pd.DataFrame(tr_time) ts_time = pd.DataFrame(ts_time) OutputDir = Path(OutputDir) truelab.to_csv(str(OutputDir / Path("NMC_true.csv")), index = False) pred.to_csv(str(OutputDir / Path("NMC_pred.csv")), index = False) tr_time.to_csv(str(OutputDir / Path("NMC_training_time.csv")), index = False) ts_time.to_csv(str(OutputDir / Path("NMC_test_time.csv")), index = False) run_NMC(argv[1], argv[2], argv[3], argv[4], argv[5], int(argv[6])) ================================================ FILE: Snakemake/Scripts/run_RF.py ================================================ import os from sys import argv from pathlib import Path import numpy as np import pandas as pd import time as tm from sklearn.ensemble import RandomForestClassifier import rpy2.robjects as robjects def run_RF(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0): ''' run baseline classifier: RF Wrapper script to run a RF classifier with 50 trees on a benchmark dataset with 5-fold cross validation, outputs lists of true and predicted cell labels as csv files, as well as computation time. Parameters ---------- DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes as row names and gene names as column names. LabelsPath : Cell population annotations file path (.csv). CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function. OutputDir : Output directory defining the path of the exported file. GeneOrderPath : Gene order file path (.csv) obtained from feature selection, defining the genes order for each cross validation fold, default is NULL. NumGenes : Number of genes used in case of feature selection (integer), default is 0. ''' # read the Rdata file robjects.r['load'](CV_RDataPath) nfolds = np.array(robjects.r['n_folds'], dtype = 'int') tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool') col = np.array(robjects.r['col_Index'], dtype = 'int') col = col - 1 test_ind = np.array(robjects.r['Test_Idx']) train_ind = np.array(robjects.r['Train_Idx']) # read the data data = pd.read_csv(DataPath,index_col=0,sep=',') labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col) labels = labels.iloc[tokeep] data = data.iloc[tokeep] # read the feature file if (NumGenes > 0): features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',') # normalize data data = np.log1p(data) Classifier = RandomForestClassifier(n_estimators = 50) tr_time=[] ts_time=[] truelab = [] pred = [] for i in range(np.squeeze(nfolds)): test_ind_i = np.array(test_ind[i], dtype = 'int') - 1 train_ind_i = np.array(train_ind[i], dtype = 'int') - 1 train=data.iloc[train_ind_i] test=data.iloc[test_ind_i] y_train=labels.iloc[train_ind_i] y_test=labels.iloc[test_ind_i] if (NumGenes > 0): feat_to_use = features.iloc[0:NumGenes,i] train = train.iloc[:,feat_to_use] test = test.iloc[:,feat_to_use] start=tm.time() Classifier.fit(train, y_train) tr_time.append(tm.time()-start) start=tm.time() predicted = Classifier.predict(test) ts_time.append(tm.time()-start) truelab.extend(y_test.values) pred.extend(predicted) truelab = pd.DataFrame(truelab) pred = pd.DataFrame(pred) tr_time = pd.DataFrame(tr_time) ts_time = pd.DataFrame(ts_time) OutputDir = Path(OutputDir) truelab.to_csv(str(OutputDir / Path("RF_true.csv")), index = False) pred.to_csv(str(OutputDir / Path("RF_pred.csv")), index = False) tr_time.to_csv(str(OutputDir / Path("RF_training_time.csv")), index = False) ts_time.to_csv(str(OutputDir / Path("RF_test_time.csv")), index = False) run_RF(argv[1], argv[2], argv[3], argv[4], argv[5], int(argv[6])) ================================================ FILE: Snakemake/Scripts/run_SCINA.R ================================================ run_SCINA<-function(DataPath,LabelsPath,GeneSigPath,OutputDir){ " run SCINA Wrapper script to run SCINA on a benchmark dataset, outputs lists of true and predicted cell labels as csv files, as well as computation time. Parameters ---------- DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes as row names and gene names as column names. LabelsPath : Cell population annotations file path (.csv). GeneSigPath : Cell type marker genes file path (.csv) OutputDir : Output directory defining the path of the exported file. " Data <- read.csv(DataPath,row.names = 1) Labels <- as.vector(as.matrix(read.csv(LabelsPath))) Data <- Data[is.element(Labels,c('CD14+ Monocyte','CD19+ B','CD56+ NK')),] Labels <- Labels[is.element(Labels,c('CD14+ Monocyte','CD19+ B','CD56+ NK'))] Labels[Labels == 'CD14+ Monocyte'] <- 'CD14_Monocyte' Labels[Labels == 'CD19+ B'] <- 'CD19_B' Labels[Labels == 'CD56+ NK'] <- 'CD56_NK' ############################################################################# # SCINA # ############################################################################# library(SCINA) Signature_Genes <- preprocess.signatures(GeneSigPath) True_Labels_SCINA <- list() Pred_Labels_SCINA <- list() Total_Time_SCINA <- list() library(preprocessCore) Data = t(as.matrix(Data)) Data=log(Data+1) Data[]=normalize.quantiles(Data) start_time <- Sys.time() results = SCINA(Data, Signature_Genes) end_time <- Sys.time() True_Labels_SCINA <- Labels Pred_Labels_SCINA <- results$cell_labels Total_Time_SCINA <- as.numeric(difftime(end_time,start_time,units = 'secs')) setwd(OutputDir) write.csv(True_Labels_SCINA,'SCINA_True_Labels.csv',row.names = FALSE) write.csv(Pred_Labels_SCINA,'SCINA_Pred_Labels.csv',row.names = FALSE) write.csv(Total_Time_SCINA,'SCINA_Total_Time.csv',row.names = FALSE) } ================================================ FILE: Snakemake/Scripts/run_SVM.py ================================================ import os from sys import argv from pathlib import Path import numpy as np import pandas as pd import time as tm from sklearn.svm import LinearSVC import rpy2.robjects as robjects def run_SVM(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0): ''' run baseline classifier: SVM Wrapper script to run an SVM classifier with a linear kernel on a benchmark dataset with 5-fold cross validation, outputs lists of true and predicted cell labels as csv files, as well as computation time. Parameters ---------- DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes as row names and gene names as column names. LabelsPath : Cell population annotations file path (.csv). CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function. OutputDir : Output directory defining the path of the exported file. GeneOrderPath : Gene order file path (.csv) obtained from feature selection, defining the genes order for each cross validation fold, default is NULL. NumGenes : Number of genes used in case of feature selection (integer), default is 0. ''' # read the Rdata file robjects.r['load'](CV_RDataPath) nfolds = np.array(robjects.r['n_folds'], dtype = 'int') tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool') col = np.array(robjects.r['col_Index'], dtype = 'int') col = col - 1 test_ind = np.array(robjects.r['Test_Idx']) train_ind = np.array(robjects.r['Train_Idx']) # read the data data = pd.read_csv(DataPath,index_col=0,sep=',') labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col) labels = labels.iloc[tokeep] data = data.iloc[tokeep] # read the feature file if (NumGenes > 0): features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',') # normalize data data = np.log1p(data) Classifier = LinearSVC() tr_time=[] ts_time=[] truelab = [] pred = [] for i in range(np.squeeze(nfolds)): test_ind_i = np.array(test_ind[i], dtype = 'int') - 1 train_ind_i = np.array(train_ind[i], dtype = 'int') - 1 train=data.iloc[train_ind_i] test=data.iloc[test_ind_i] y_train=labels.iloc[train_ind_i] y_test=labels.iloc[test_ind_i] if (NumGenes > 0): feat_to_use = features.iloc[0:NumGenes,i] train = train.iloc[:,feat_to_use] test = test.iloc[:,feat_to_use] start=tm.time() Classifier.fit(train, y_train) tr_time.append(tm.time()-start) start=tm.time() predicted = Classifier.predict(test) ts_time.append(tm.time()-start) truelab.extend(y_test.values) pred.extend(predicted) truelab = pd.DataFrame(truelab) pred = pd.DataFrame(pred) tr_time = pd.DataFrame(tr_time) ts_time = pd.DataFrame(ts_time) OutputDir = Path(OutputDir) truelab.to_csv(str(OutputDir / Path("SVM_true.csv")), index = False) pred.to_csv(str(OutputDir / Path("SVM_pred.csv")), index = False) tr_time.to_csv(str(OutputDir / Path("SVM_training_time.csv")), index = False) ts_time.to_csv(str(OutputDir / Path("SVM_test_time.csv")), index = False) run_SVM(argv[1], argv[2], argv[3], argv[4], argv[5], int(argv[6])) ================================================ FILE: Snakemake/Scripts/run_SVM_rejection.py ================================================ import os from sys import argv from pathlib import Path import numpy as np import pandas as pd import time as tm from sklearn.svm import LinearSVC import rpy2.robjects as robjects from sklearn.calibration import CalibratedClassifierCV def run_SVM(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0, Threshold = 0.7): ''' run baseline classifier: SVM Wrapper script to run an SVM classifier with a linear kernel on a benchmark dataset with 5-fold cross validation, outputs lists of true and predicted cell labels as csv files, as well as computation time. Parameters ---------- DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes as row names and gene names as column names. LabelsPath : Cell population annotations file path (.csv). CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function. OutputDir : Output directory defining the path of the exported file. GeneOrderPath : Gene order file path (.csv) obtained from feature selection, defining the genes order for each cross validation fold, default is NULL. NumGenes : Number of genes used in case of feature selection (integer), default is 0. Threshold : Threshold used when rejecting the cells, default is 0.7. ''' # read the Rdata file robjects.r['load'](CV_RDataPath) nfolds = np.array(robjects.r['n_folds'], dtype = 'int') tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool') col = np.array(robjects.r['col_Index'], dtype = 'int') col = col - 1 test_ind = np.array(robjects.r['Test_Idx']) train_ind = np.array(robjects.r['Train_Idx']) # read the data data = pd.read_csv(DataPath,index_col=0,sep=',') labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col) labels = labels.iloc[tokeep] data = data.iloc[tokeep] # read the feature file if (NumGenes > 0): features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',') # normalize data data = np.log1p(data) Classifier = LinearSVC() clf = CalibratedClassifierCV(Classifier) tr_time=[] ts_time=[] truelab = [] pred = [] for i in range(np.squeeze(nfolds)): test_ind_i = np.array(test_ind[i], dtype = 'int') - 1 train_ind_i = np.array(train_ind[i], dtype = 'int') - 1 train=data.iloc[train_ind_i] test=data.iloc[test_ind_i] y_train=labels.iloc[train_ind_i] y_test=labels.iloc[test_ind_i] if (NumGenes > 0): feat_to_use = features.iloc[0:NumGenes,i] train = train.iloc[:,feat_to_use] test = test.iloc[:,feat_to_use] start=tm.time() clf.fit(train, y_train) tr_time.append(tm.time()-start) start=tm.time() predicted = clf.predict(test) prob = np.max(clf.predict_proba(test), axis = 1) unlabeled = np.where(prob < Threshold) predicted[unlabeled] = 'Unknown' ts_time.append(tm.time()-start) truelab.extend(y_test.values) pred.extend(predicted) truelab = pd.DataFrame(truelab) pred = pd.DataFrame(pred) tr_time = pd.DataFrame(tr_time) ts_time = pd.DataFrame(ts_time) OutputDir = Path(OutputDir) truelab.to_csv(str(OutputDir / Path("SVM_rejection_true.csv")), index = False) pred.to_csv(str(OutputDir / Path("SVM_rejection_pred.csv")), index = False) tr_time.to_csv(str(OutputDir / Path("SVM_rejection_training_time.csv")), index = False) ts_time.to_csv(str(OutputDir / Path("SVM_rejection_test_time.csv")), index = False) run_SVM(argv[1], argv[2], argv[3], argv[4], argv[5], int(argv[6])) ================================================ FILE: Snakemake/Scripts/run_SingleR.R ================================================ args <- commandArgs(TRUE) run_SingleR<-function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){ " run SingleR Wrapper script to run SingleR on a benchmark dataset with 5-fold cross validation, outputs lists of true and predicted cell labels as csv files, as well as computation time. Parameters ---------- DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes as row names and gene names as column names. LabelsPath : Cell population annotations file path (.csv). CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function. OutputDir : Output directory defining the path of the exported file. GeneOrderPath : Gene order file path (.csv) obtained from feature selection, defining the genes order for each cross validation fold, default is NULL. NumGenes : Number of genes used in case of feature selection (integer), default is NULL. " Data <- read.csv(DataPath,row.names = 1) Labels <- as.matrix(read.csv(LabelsPath)) load(CV_RDataPath) Labels <- as.vector(Labels[,col_Index]) Data <- Data[Cells_to_Keep,] Labels <- Labels[Cells_to_Keep] if(!is.null(GeneOrderPath) & !is.null (NumGenes)){ GenesOrder = read.csv(GeneOrderPath) } ############################################################################# # SingleR # ############################################################################# library(SingleR) library(Seurat) True_Labels_SingleR <- list() Pred_Labels_SingleR <- list() Total_Time_SingleR <- list() Data = t(as.matrix(Data)) for (i in c(1:n_folds)){ if(!is.null(GeneOrderPath) & !is.null (NumGenes)){ start_time <- Sys.time() singler = SingleR(method = "single", Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Test_Idx[[i]]], Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Train_Idx[[i]]], Labels[Train_Idx[[i]]], numCores = 1) end_time <- Sys.time() } else{ start_time <- Sys.time() singler = SingleR(method = "single", Data[,Test_Idx[[i]]], Data[,Train_Idx[[i]]], Labels[Train_Idx[[i]]], numCores = 1) end_time <- Sys.time() } Total_Time_SingleR[i] <- as.numeric(difftime(end_time,start_time,units = 'secs')) True_Labels_SingleR[i] <- list(Labels[Test_Idx[[i]]]) Pred_Labels_SingleR[i] <- list(as.vector(singler$labels)) } True_Labels_SingleR <- as.vector(unlist(True_Labels_SingleR)) Pred_Labels_SingleR <- as.vector(unlist(Pred_Labels_SingleR)) Total_Time_SingleR <- as.vector(unlist(Total_Time_SingleR)) write.csv(True_Labels_SingleR,paste0(OutputDir,'/SingleR_true.csv'),row.names = FALSE) write.csv(Pred_Labels_SingleR,paste0(OutputDir,'/SingleR_pred.csv'),row.names = FALSE) write.csv(Total_Time_SingleR,paste0(OutputDir,'/SingleR_total_time.csv'),row.names = FALSE) } if (args[6] == "0") { run_SingleR(args[1], args[2], args[3], args[4]) } else { run_SingleR(args[1], args[2], args[3], args[4], args[5], as.numeric(args[6])) } ================================================ FILE: Snakemake/Scripts/run_kNN50.py ================================================ import os from sys import argv from pathlib import Path import numpy as np import pandas as pd import time as tm from sklearn.neighbors import KNeighborsClassifier import rpy2.robjects as robjects def run_kNN50(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0): ''' run baseline classifiers: kNN Wrapper script to run kNN (with k = 50) classifier on a benchmark dataset with 5-fold cross validation, outputs lists of true and predicted cell labels as csv files, as well as computation time. Parameters ---------- DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes as row names and gene names as column names. LabelsPath : Cell population annotations file path (.csv). CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function. OutputDir : Output directory defining the path of the exported file. GeneOrderPath : Gene order file path (.csv) obtained from feature selection, defining the genes order for each cross validation fold, default is NULL. NumGenes : Number of genes used in case of feature selection (integer), default is 0. ''' # read the Rdata file robjects.r['load'](CV_RDataPath) nfolds = np.array(robjects.r['n_folds'], dtype = 'int') tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool') col = np.array(robjects.r['col_Index'], dtype = 'int') col = col - 1 test_ind = np.array(robjects.r['Test_Idx']) train_ind = np.array(robjects.r['Train_Idx']) # read the data data = pd.read_csv(DataPath,index_col=0,sep=',') labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col) labels = labels.iloc[tokeep] data = data.iloc[tokeep] # read the feature file if (NumGenes > 0): features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',') # normalize data data = np.log1p(data) Classifier = KNeighborsClassifier(n_neighbors=50) tr_time=[] ts_time=[] truelab = [] pred = [] for i in range(np.squeeze(nfolds)): test_ind_i = np.array(test_ind[i], dtype = 'int') - 1 train_ind_i = np.array(train_ind[i], dtype = 'int') - 1 train=data.iloc[train_ind_i] test=data.iloc[test_ind_i] y_train=labels.iloc[train_ind_i] y_test=labels.iloc[test_ind_i] if (NumGenes > 0): feat_to_use = features.iloc[0:NumGenes,i] train = train.iloc[:,feat_to_use] test = test.iloc[:,feat_to_use] start=tm.time() Classifier.fit(train, y_train) tr_time.append(tm.time()-start) start=tm.time() predicted = Classifier.predict(test) ts_time.append(tm.time()-start) truelab.extend(y_test.values) pred.extend(predicted) truelab = pd.DataFrame(truelab) pred = pd.DataFrame(pred) tr_time = pd.DataFrame(tr_time) ts_time = pd.DataFrame(ts_time) OutputDir = Path(OutputDir) truelab.to_csv(str(OutputDir / Path("kNN50_true.csv")), index = False) pred.to_csv(str(OutputDir / Path("kNN50_pred.csv")), index = False) tr_time.to_csv(str(OutputDir / Path("kNN50_training_time.csv")), index = False) ts_time.to_csv(str(OutputDir / Path("kNN50_test_time.csv")), index = False) run_kNN50(argv[1], argv[2], argv[3], argv[4], argv[5], int(argv[6])) ================================================ FILE: Snakemake/Scripts/run_kNN9.py ================================================ import os from sys import argv from pathlib import Path import numpy as np import pandas as pd import time as tm from sklearn.neighbors import KNeighborsClassifier import rpy2.robjects as robjects def run_kNN9(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0): ''' run baseline classifiers: kNN Wrapper script to run kNN (with k = 9) classifier on a benchmark dataset with 5-fold cross validation, outputs lists of true and predicted cell labels as csv files, as well as computation time. Parameters ---------- DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes as row names and gene names as column names. LabelsPath : Cell population annotations file path (.csv). CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function. OutputDir : Output directory defining the path of the exported file. GeneOrderPath : Gene order file path (.csv) obtained from feature selection, defining the genes order for each cross validation fold, default is NULL. NumGenes : Number of genes used in case of feature selection (integer), default is 0. ''' # read the Rdata file robjects.r['load'](CV_RDataPath) nfolds = np.array(robjects.r['n_folds'], dtype = 'int') tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool') col = np.array(robjects.r['col_Index'], dtype = 'int') col = col - 1 test_ind = np.array(robjects.r['Test_Idx']) train_ind = np.array(robjects.r['Train_Idx']) # read the data data = pd.read_csv(DataPath,index_col=0,sep=',') labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col) labels = labels.iloc[tokeep] data = data.iloc[tokeep] # read the feature file if (NumGenes > 0): features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',') # normalize data data = np.log1p(data) Classifier = KNeighborsClassifier(n_neighbors=9) tr_time=[] ts_time=[] truelab = [] pred = [] for i in range(np.squeeze(nfolds)): test_ind_i = np.array(test_ind[i], dtype = 'int') - 1 train_ind_i = np.array(train_ind[i], dtype = 'int') - 1 train=data.iloc[train_ind_i] test=data.iloc[test_ind_i] y_train=labels.iloc[train_ind_i] y_test=labels.iloc[test_ind_i] if (NumGenes > 0): feat_to_use = features.iloc[0:NumGenes,i] train = train.iloc[:,feat_to_use] test = test.iloc[:,feat_to_use] start=tm.time() Classifier.fit(train, y_train) tr_time.append(tm.time()-start) start=tm.time() predicted = Classifier.predict(test) ts_time.append(tm.time()-start) truelab.extend(y_test.values) pred.extend(predicted) truelab = pd.DataFrame(truelab) pred = pd.DataFrame(pred) tr_time = pd.DataFrame(tr_time) ts_time = pd.DataFrame(ts_time) OutputDir = Path(OutputDir) truelab.to_csv(str(OutputDir / Path("kNN9_true.csv")), index = False) pred.to_csv(str(OutputDir / Path("kNN9_pred.csv")), index = False) tr_time.to_csv(str(OutputDir / Path("kNN9_training_time.csv")), index = False) ts_time.to_csv(str(OutputDir / Path("kNN9_test_time.csv")), index = False) run_kNN50(argv[1], argv[2], argv[3], argv[4], argv[5], int(argv[6])) ================================================ FILE: Snakemake/Scripts/run_moana.py ================================================ import os import pandas as pd import numpy as np from moana.core import ExpMatrix from moana.classify import CellTypeClassifier import time as tm import rpy2.robjects as robjects def run_moana(DataPath, LabelsPath, ClassifierPath, OutputDir, GeneOrderPath = "", NumGenes = 0): ''' run moana Wrapper script to run moana on a benchmark dataset with a pretrained classifier, outputs lists of true and predicted cell labels as csv files, as well as computation time. Parameters ---------- DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes as row names and gene names as column names. LabelsPath : Cell population annotations file path (.csv). CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function. ClassifierPath : Data file path to the pretrained classifier. OutputDir : Output directory defining the path of the exported file. GeneOrderPath : Gene order file path (.csv) obtained from feature selection, defining the genes order for each cross validation fold, default is NULL. NumGenes : Number of genes used in case of feature selection (integer), default is 0. ''' # # read the Rdata file # robjects.r['load'](CV_RDataPath) # # tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool') # col = np.array(robjects.r['col_Index'], dtype = 'int') # col = col - 1 matrix = ExpMatrix.read_tsv(DataPath, sep = ',') # matrix = matrix.iloc[tokeep] truelab = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',') # truelab = truelab.iloc[tokeep] ct_old = ['CD19+ B','CD14+ Monocyte','CD4+/CD45RA+/CD25- Naive T','CD4+/CD45RO+ Memory','CD8+/CD45RA+ Naive Cytotoxic','Dendritic', 'CD56+ NK'] ct_new = ['B cells','CD14+ monocytes','Naive CD4+ T cells','Memory CD4+ T cells','Naive CD8+ T cells','Dendritic cells','NK cells'] tokeep2 = np.isin(truelab,ct_old) truelab = truelab[tokeep2] print(len(truelab)) matrix = matrix.iloc[np.squeeze(tokeep2)] for i in range(len(ct_old)): truelab.iloc[truelab == ct_old[i]] = ct_new[i] # read the feature file if (NumGenes > 0): features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',') feat_to_use = features.iloc[0:NumGenes,0] matrix = matrix.iloc[:,feat_to_use] data = ExpMatrix(X = np.transpose(matrix.X), genes = matrix.cells, cells = matrix.genes) data.genes.name = 'Genes' data.cells.name = 'Cells' data.index.name = 'Genes' data.columns.name = 'Cells' clf = CellTypeClassifier.read_pickle(ClassifierPath) start = tm.time() predictions = clf.predict(data) runtime = tm.time() - start np.asarray(predictions) pred = pd.DataFrame(predictions) os.chdir(OutputDir) if (NumGenes == 0): truelab.to_csv("moana_True_Labels.csv", index = False) pred.to_csv("moana_Pred_Labels.csv", index = False) with open("moana_Total_Time.csv", 'w') as f: f.write("%f\n" % runtime) else: truelab.to_csv("moana_" + str(NumGenes) + "_True_Labels.csv", index = False) pred.to_csv("moana_" + str(NumGenes) + "_Pred_Labels.csv", index = False) with open("moana_" + str(NumGenes) + "_Total_Time.csv", 'w') as f: f.write("%f\n" % runtime) ================================================ FILE: Snakemake/Scripts/run_scID.R ================================================ args <- commandArgs(TRUE) run_scID<-function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){ " run scID Wrapper script to run scID on a benchmark dataset with 5-fold cross validation, outputs lists of true and predicted cell labels as csv files, as well as computation time. Parameters ---------- DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes as row names and gene names as column names. LabelsPath : Cell population annotations file path (.csv). CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function. OutputDir : Output directory defining the path of the exported file. GeneOrderPath : Gene order file path (.csv) obtained from feature selection, defining the genes order for each cross validation fold, default is NULL. NumGenes : Number of genes used in case of feature selection (integer), default is NULL. " Data <- read.csv(DataPath,row.names = 1) Labels <- as.matrix(read.csv(LabelsPath)) load(CV_RDataPath) Labels <- as.vector(Labels[,col_Index]) Data <- Data[Cells_to_Keep,] Labels <- Labels[Cells_to_Keep] if(!is.null(GeneOrderPath) & !is.null (NumGenes)){ GenesOrder = read.csv(GeneOrderPath) } ############################################################################# # scID # ############################################################################# library(scID) library(Seurat) True_Labels_scID <- list() Pred_Labels_scID <- list() Total_Time_scID <- list() Data = t(as.matrix(Data)) for (i in c(1:n_folds)){ if(!is.null(GeneOrderPath) & !is.null (NumGenes)){ Train_Labels <- list(Labels[Train_Idx[[i]]]) names(Train_Labels[[1]]) <- colnames(Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Train_Idx[[i]]]) start_time <- Sys.time() scID_output <- scid_multiclass(Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Test_Idx[[i]]], Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Train_Idx[[i]]], Train_Labels[[1]]) end_time <- Sys.time() } else{ Train_Labels <- list(Labels[Train_Idx[[i]]]) names(Train_Labels[[1]]) <- colnames(Data[,Train_Idx[[i]]]) start_time <- Sys.time() scID_output <- scid_multiclass(Data[,Test_Idx[[i]]], Data[,Train_Idx[[i]]], Train_Labels[[1]]) end_time <- Sys.time() } Total_Time_scID[i] <- as.numeric(difftime(end_time,start_time,units = 'secs')) True_Labels_scID[i] <- list(Labels[Test_Idx[[i]]]) Pred_Labels_scID[i] <- list(as.vector(scID_output$labels)) } True_Labels_scID <- as.vector(unlist(True_Labels_scID)) Pred_Labels_scID <- as.vector(unlist(Pred_Labels_scID)) Total_Time_scID <- as.vector(unlist(Total_Time_scID)) write.csv(Pred_Labels_scID, paste0(OutputDir,'/scID_pred.csv'),row.names = FALSE) write.csv(True_Labels_scID, paste0(OutputDir,'/scID_true.csv'),row.names = FALSE) write.csv(Total_Time_scID,paste0(OutputDir,'/scID_total_time.csv'),row.names = FALSE) } if (args[6] == "0") { run_scID(args[1], args[2], args[3], args[4]) } else { run_scID(args[1], args[2], args[3], args[4], args[5], as.numeric(args[6])) } ================================================ FILE: Snakemake/Scripts/run_scPred.R ================================================ run_scPred<-function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){ " run scPred Wrapper script to run scPred on a benchmark dataset with 5-fold cross validation, outputs lists of true and predicted cell labels as csv files, as well as computation time. Parameters ---------- DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes as row names and gene names as column names. LabelsPath : Cell population annotations file path (.csv). CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function. OutputDir : Output directory defining the path of the exported file. GeneOrderPath : Gene order file path (.csv) obtained from feature selection, defining the genes order for each cross validation fold, default is NULL. NumGenes : Number of genes used in case of feature selection (integer), default is NULL. " Data <- read.csv(DataPath,row.names = 1) Labels <- as.matrix(read.csv(LabelsPath)) load(CV_RDataPath) Labels <- as.vector(Labels[,col_Index]) Data <- Data[Cells_to_Keep,] Labels <- Labels[Cells_to_Keep] if(!is.null(GeneOrderPath) & !is.null (NumGenes)){ GenesOrder = read.csv(GeneOrderPath) } ############################################################################# # scPred # ############################################################################# library(scPred) library(tidyverse) library(SingleCellExperiment) True_Labels_scPred <- list() Pred_Labels_scPred <- list() Training_Time_scPred <- list() Testing_Time_scPred <- list() Data = t(as.matrix(Data)) for (i in c(1:n_folds)){ if(!is.null(GeneOrderPath) & !is.null (NumGenes)){ sce <- SingleCellExperiment(list(normcounts = Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Train_Idx[[i]]]), colData = data.frame(cell_type1 = Labels[Train_Idx[[i]]])) sce_counts <- normcounts(sce) sce_cpm <- apply(sce_counts, 2, function(x) (x/sum(x))*1000000) sce_metadata <- as.data.frame(colData(sce)) sce_test <- SingleCellExperiment(list(normcounts = Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Test_Idx[[i]]]), colData = data.frame(cell_type1 = Labels[Test_Idx[[i]]])) sce_counts_test <- normcounts(sce_test) sce_cpm_test <- apply(sce_counts_test, 2, function(x) (x/sum(x))*1000000) sce_metadata_test <- as.data.frame(colData(sce_test)) } else{ sce <- SingleCellExperiment(list(normcounts = Data[,Train_Idx[[i]]]), colData = data.frame(cell_type1 = Labels[Train_Idx[[i]]])) sce_counts <- normcounts(sce) sce_cpm <- apply(sce_counts, 2, function(x) (x/sum(x))*1000000) sce_metadata <- as.data.frame(colData(sce)) sce_test <- SingleCellExperiment(list(normcounts = Data[,Test_Idx[[i]]]), colData = data.frame(cell_type1 = Labels[Test_Idx[[i]]])) sce_counts_test <- normcounts(sce_test) sce_cpm_test <- apply(sce_counts_test, 2, function(x) (x/sum(x))*1000000) sce_metadata_test <- as.data.frame(colData(sce_test)) } # scPred Training start_time <- Sys.time() set.seed(1234) scp <- eigenDecompose(sce_cpm) scPred::metadata(scp) <- sce_metadata scp <- getFeatureSpace(scp, pVar = 'cell_type1') # plotEigen(scp, group = 'cell_type1') scp <- trainModel(scp) # plotTrainProbs(scp) end_time <- Sys.time() Training_Time_scPred[i] <- as.numeric(difftime(end_time,start_time,units = 'secs')) # scPred Prediction start_time <- Sys.time() scp <- scPredict(scp,newData = sce_cpm_test) end_time <- Sys.time() Testing_Time_scPred[i] <- as.numeric(difftime(end_time,start_time,units = 'secs')) True_Labels_scPred[i] <- list(Labels[Test_Idx[[i]]]) Pred_Labels_scPred[i] <- list(getPredictions(scp)$predClass) } True_Labels_scPred <- as.vector(unlist(True_Labels_scPred)) Pred_Labels_scPred <- as.vector(unlist(Pred_Labels_scPred)) Training_Time_scPred <- as.vector(unlist(Training_Time_scPred)) Testing_Time_scPred <- as.vector(unlist(Testing_Time_scPred)) setwd(OutputDir) if(!is.null(GeneOrderPath) & !is.null (NumGenes)){ write.csv(True_Labels_scPred,paste('scPred_',NumGenes,'_True_Labels.csv', sep = ''),row.names = FALSE) write.csv(Pred_Labels_scPred,paste('scPred_',NumGenes,'_Pred_Labels.csv', sep = ''),row.names = FALSE) write.csv(Training_Time_scPred,paste('scPred_',NumGenes,'_Training_Time.csv', sep = ''),row.names = FALSE) write.csv(Testing_Time_scPred,paste('scPred_',NumGenes,'_Testing_Time.csv', sep = ''),row.names = FALSE) } else{ write.csv(True_Labels_scPred,'scPred_True_Labels.csv',row.names = FALSE) write.csv(Pred_Labels_scPred,'scPred_Pred_Labels.csv',row.names = FALSE) write.csv(Training_Time_scPred,'scPred_Training_Time.csv',row.names = FALSE) write.csv(Testing_Time_scPred,'scPred_Testing_Time.csv',row.names = FALSE) } } ================================================ FILE: Snakemake/Scripts/run_scVI.py ================================================ from scvi.dataset import CsvDataset import os from sys import argv from pathlib import Path from scvi.dataset import CsvDataset import numpy as np import pandas as pd from scvi.models import SCANVI from scvi.inference import SemiSupervisedTrainer import time as tm import rpy2.robjects as robjects def run_scVI(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0): ''' run scVI Wrapper script to run scVI on a benchmark dataset with 5-fold cross validation, outputs lists of true and predicted cell labels as csv files, as well as computation time. Parameters ---------- DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes as row names and gene names as column names. LabelsPath : Cell population annotations file path (.csv). CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function. OutputDir : Output directory defining the path of the exported file. GeneOrderPath : Gene order file path (.csv) obtained from feature selection, defining the genes order for each cross validation fold, default is NULL. NumGenes : Number of genes used in case of feature selection (integer), default is 0. ''' # read the Rdata file robjects.r['load'](CV_RDataPath) nfolds = np.array(robjects.r['n_folds'], dtype = 'int') tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool') col = np.array(robjects.r['col_Index'], dtype = 'int') col = col - 1 test_ind = np.array(robjects.r['Test_Idx']) train_ind = np.array(robjects.r['Train_Idx']) # read the data data = pd.read_csv(DataPath,index_col=0,sep=',') labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col) labels = labels.iloc[tokeep] data = data.iloc[tokeep] # read the feature file if (NumGenes > 0): features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',') if (NumGenes == 0): #save labels as csv file with header and index column labels.to_csv('Labels_scvi.csv') data.to_csv('Data_scvi.csv') train = CsvDataset('Data_scvi.csv', save_path = "", sep = ",", labels_file = "Labels_scvi.csv", gene_by_cell = False) ## this semisupervised trainer automatically uses a part of the input data for training and a part for testing scanvi = SCANVI(train.nb_genes, train.n_batches, train.n_labels) trainer_scanvi = SemiSupervisedTrainer(scanvi, train, frequency=5) n_epochs = 200 truelab = [] pred = [] tr_time = [] ts_time = [] for i in range(np.squeeze(nfolds)): test_ind_i = np.array(test_ind[i], dtype = 'int') - 1 train_ind_i = np.array(train_ind[i], dtype = 'int') - 1 if (NumGenes > 0): feat_to_use = features.iloc[0:NumGenes,i] data2 = data.iloc[:,feat_to_use] labels.to_csv(OutputDir +'/Labels_scvi.csv') data2.to_csv(OutputDir +'/Data_scvi.csv') train = CsvDataset(OutputDir +'/Data_scvi.csv', save_path = "", sep = ",", labels_file = OutputDir +"/Labels_scvi.csv", gene_by_cell = False, new_n_genes = False) ## this semisupervised trainer automatically uses a part of the input data for training and a part for testing scanvi = SCANVI(train.nb_genes, train.n_batches, train.n_labels) trainer_scanvi = SemiSupervisedTrainer(scanvi, train, frequency=5) trainer_scanvi.labelled_set = trainer_scanvi.create_posterior(indices=(train_ind_i).ravel(), shuffle = False) trainer_scanvi.labelled_set.to_monitor = ['ll','accuracy'] trainer_scanvi.unlabelled_set = trainer_scanvi.create_posterior(indices=(test_ind_i).ravel(), shuffle = False) trainer_scanvi.unlabelled_set.to_monitor = ['ll','accuracy'] start = tm.time() trainer_scanvi.train(n_epochs) tr_time.append(tm.time()-start) ## labels of test set are in y_pred ## labels are returned in numbers, should be mapped back to the real labels ## indices are permutated start = tm.time() y_true, y_pred = trainer_scanvi.unlabelled_set.compute_predictions() ts_time.append(tm.time()-start) truelab.extend(y_true) pred.extend(y_pred) #write results truelab = pd.DataFrame(truelab) pred = pd.DataFrame(pred) tr_time = pd.DataFrame(tr_time) ts_time = pd.DataFrame(ts_time) truelab.to_csv(str(Path(OutputDir + "/scVI_true.csv")), index=False) pred.to_csv(str(Path(OutputDir + "/scVI_pred.csv")), index=False) tr_time.to_csv(str(Path(OutputDir + "/scVI_training_time.csv")), index=False) ts_time.to_csv(str(Path(OutputDir + "/scVI_test_time.csv")), index=False) run_scVI(argv[1], argv[2], argv[3], argv[4], argv[5], int(argv[6])) ================================================ FILE: Snakemake/Scripts/run_scmap.R ================================================ run_scmap <- function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){ " run scmap Wrapper script to run scmap on a benchmark dataset with 5-fold cross validation, outputs lists of true and predicted cell labels as csv files, as well as computation time. Parameters ---------- DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes as row names and gene names as column names. LabelsPath : Cell population annotations file path (.csv). CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function. OutputDir : Output directory defining the path of the exported file. GeneOrderPath : Gene order file path (.csv) obtained from feature selection, defining the genes order for each cross validation fold, default is NULL. NumGenes : Number of genes used in case of feature selection (integer), default is NULL. " Data <- read.csv(DataPath,row.names = 1) Labels <- as.matrix(read.csv(LabelsPath)) load(CV_RDataPath) Labels <- as.vector(Labels[,col_Index]) Data <- Data[Cells_to_Keep,] Labels <- Labels[Cells_to_Keep] if(!is.null(GeneOrderPath) & !is.null (NumGenes)){ GenesOrder = read.csv(GeneOrderPath) } ############################################################################# # scmap # ############################################################################# library(scmap) library(SingleCellExperiment) True_Labels_scmapcluster <- list() Pred_Labels_scmapcluster <- list() True_Labels_scmapcell <- list() Pred_Labels_scmapcell <- list() Training_Time_scmapcluster <- list() Testing_Time_scmapcluster <- list() Training_Time_scmapcell <- list() Testing_Time_scmapcell <- list() Data = t(as.matrix(Data)) for (i in c(1:n_folds)){ if(!is.null(GeneOrderPath) & !is.null (NumGenes)){ sce <- SingleCellExperiment(list(normcounts = Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Train_Idx[[i]]]), colData = data.frame(cell_type1 = Labels[Train_Idx[[i]]])) logcounts(sce) <- log2(normcounts(sce) + 1) # use gene names as feature symbols rowData(sce)$feature_symbol <- rownames(sce) sce <- selectFeatures(sce, n_features = NumGenes, suppress_plot = TRUE) sce_test <- SingleCellExperiment(list(normcounts = Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Test_Idx[[i]]]), colData = data.frame(cell_type1 = Labels[Test_Idx[[i]]])) logcounts(sce_test) <- log2(normcounts(sce_test) + 1) rowData(sce_test)$feature_symbol <- rownames(sce_test) sce_test@rowRanges@elementMetadata@listData = sce@rowRanges@elementMetadata@listData } else{ sce <- SingleCellExperiment(list(normcounts = Data[,Train_Idx[[i]]]), colData = data.frame(cell_type1 = Labels[Train_Idx[[i]]])) logcounts(sce) <- log2(normcounts(sce) + 1) # use gene names as feature symbols rowData(sce)$feature_symbol <- rownames(sce) sce <- selectFeatures(sce, suppress_plot = TRUE) sce_test <- SingleCellExperiment(list(normcounts = Data[,Test_Idx[[i]]]), colData = data.frame(cell_type1 = Labels[Test_Idx[[i]]])) logcounts(sce_test) <- log2(normcounts(sce_test) + 1) rowData(sce_test)$feature_symbol <- rownames(sce_test) sce_test@rowRanges@elementMetadata@listData = sce@rowRanges@elementMetadata@listData } # scmap-cluster start_time <- Sys.time() sce <- indexCluster(sce) end_time <- Sys.time() Training_Time_scmapcluster[i] <- as.numeric(difftime(end_time,start_time,units = 'secs')) start_time <- Sys.time() scmapCluster_results <- scmapCluster(projection = sce_test,index_list = list(metadata(sce)$scmap_cluster_index)) end_time <- Sys.time() Testing_Time_scmapcluster[i] <- as.numeric(difftime(end_time,start_time,units = 'secs')) True_Labels_scmapcluster[i] <- list(Labels[Test_Idx[[i]]]) Pred_Labels_scmapcluster[i] <- list(scmapCluster_results$combined_labs) # scmap-cell start_time <- Sys.time() set.seed(1) sce <- indexCell(sce) end_time <- Sys.time() Training_Time_scmapcell[i] <- as.numeric(difftime(end_time,start_time,units = 'secs')) start_time <- Sys.time() scmapCell_results <- scmapCell(sce_test,list(metadata(sce)$scmap_cell_index)) scmapCell_clusters <- scmapCell2Cluster(scmapCell_results,list(as.character(colData(sce)$cell_type1))) end_time <- Sys.time() Testing_Time_scmapcell[i] <- as.numeric(difftime(end_time,start_time,units = 'secs')) True_Labels_scmapcell[i] <- list(Labels[Test_Idx[[i]]]) Pred_Labels_scmapcell[i] <- list(scmapCell_clusters$combined_labs) } True_Labels_scmapcluster <- as.vector(unlist(True_Labels_scmapcluster)) Pred_Labels_scmapcluster <- as.vector(unlist(Pred_Labels_scmapcluster)) True_Labels_scmapcell <- as.vector(unlist(True_Labels_scmapcell)) Pred_Labels_scmapcell <- as.vector(unlist(Pred_Labels_scmapcell)) Training_Time_scmapcluster <- as.vector(unlist(Training_Time_scmapcluster)) Testing_Time_scmapcluster <- as.vector(unlist(Testing_Time_scmapcluster)) Training_Time_scmapcell <- as.vector(unlist(Training_Time_scmapcell)) Testing_Time_scmapcell <- as.vector(unlist(Testing_Time_scmapcell)) setwd(OutputDir) if (!is.null(GeneOrderPath) & !is.null (NumGenes)){ write.csv(True_Labels_scmapcluster,paste('scmapcluster_',NumGenes,'_True_Labels.csv', sep = ''),row.names = FALSE) write.csv(Pred_Labels_scmapcluster,paste('scmapcluster_',NumGenes,'_Pred_Labels.csv', sep = ''),row.names = FALSE) write.csv(True_Labels_scmapcell,paste('scmapcell_',NumGenes,'_True_Labels.csv', sep = ''),row.names = FALSE) write.csv(Pred_Labels_scmapcell,paste('scmapcell_',NumGenes,'_Pred_Labels.csv', sep = ''),row.names = FALSE) write.csv(Training_Time_scmapcluster,paste('scmapcluster_',NumGenes,'_Training_Time.csv', sep = ''),row.names = FALSE) write.csv(Testing_Time_scmapcluster,paste('scmapcluster_',NumGenes,'_Testing_Time.csv', sep = ''),row.names = FALSE) write.csv(Training_Time_scmapcell,paste('scmapcell_',NumGenes,'_Training_Time.csv', sep = ''),row.names = FALSE) write.csv(Testing_Time_scmapcell,paste('scmapcell_',NumGenes,'_Testing_Time.csv', sep = ''),row.names = FALSE) } else{ write.csv(True_Labels_scmapcluster,'scmapcluster_True_Labels.csv',row.names = FALSE) write.csv(Pred_Labels_scmapcluster,'scmapcluster_Pred_Labels.csv',row.names = FALSE) write.csv(True_Labels_scmapcell,'scmapcell_True_Labels.csv',row.names = FALSE) write.csv(Pred_Labels_scmapcell,'scmapcell_Pred_Labels.csv',row.names = FALSE) write.csv(Training_Time_scmapcluster,'scmapcluster_Training_Time.csv',row.names = FALSE) write.csv(Testing_Time_scmapcluster,'scmapcluster_Testing_Time.csv',row.names = FALSE) write.csv(Training_Time_scmapcell,'scmapcell_Training_Time.csv',row.names = FALSE) write.csv(Testing_Time_scmapcell,'scmapcell_Testing_Time.csv',row.names = FALSE) } } ================================================ FILE: Snakemake/Scripts/run_scmapcell.R ================================================ args <- commandArgs(TRUE) run_scmapcell <- function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){ " run scmapcell Wrapper script to run scmap on a benchmark dataset with 5-fold cross validation, outputs lists of true and predicted cell labels as csv files, as well as computation time. Parameters ---------- DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes as row names and gene names as column names. LabelsPath : Cell population annotations file path (.csv). CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function. OutputDir : Output directory defining the path of the exported file. GeneOrderPath : Gene order file path (.csv) obtained from feature selection, defining the genes order for each cross validation fold, default is NULL. NumGenes : Number of genes used in case of feature selection (integer), default is NULL. " Data <- read.csv(DataPath,row.names = 1) Labels <- as.matrix(read.csv(LabelsPath)) load(CV_RDataPath) Labels <- as.vector(Labels[,col_Index]) Data <- Data[Cells_to_Keep,] Labels <- Labels[Cells_to_Keep] if(!is.null(GeneOrderPath) & !is.null (NumGenes)){ GenesOrder = read.csv(GeneOrderPath) } ############################################################################# # scmap # ############################################################################# library(scmap) library(SingleCellExperiment) True_Labels_scmapcell <- list() Pred_Labels_scmapcell <- list() Training_Time_scmapcell <- list() Testing_Time_scmapcell <- list() Data = t(as.matrix(Data)) for (i in c(1:n_folds)){ if(!is.null(GeneOrderPath) & !is.null (NumGenes)){ sce <- SingleCellExperiment(list(normcounts = Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Train_Idx[[i]]]), colData = data.frame(cell_type1 = Labels[Train_Idx[[i]]])) logcounts(sce) <- log2(normcounts(sce) + 1) # use gene names as feature symbols rowData(sce)$feature_symbol <- rownames(sce) sce <- selectFeatures(sce, n_features = NumGenes, suppress_plot = TRUE) sce_test <- SingleCellExperiment(list(normcounts = Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Test_Idx[[i]]]), colData = data.frame(cell_type1 = Labels[Test_Idx[[i]]])) logcounts(sce_test) <- log2(normcounts(sce_test) + 1) rowData(sce_test)$feature_symbol <- rownames(sce_test) sce_test@rowRanges@elementMetadata@listData = sce@rowRanges@elementMetadata@listData } else{ sce <- SingleCellExperiment(list(normcounts = Data[,Train_Idx[[i]]]), colData = data.frame(cell_type1 = Labels[Train_Idx[[i]]])) logcounts(sce) <- log2(normcounts(sce) + 1) # use gene names as feature symbols rowData(sce)$feature_symbol <- rownames(sce) sce <- selectFeatures(sce, suppress_plot = TRUE) sce_test <- SingleCellExperiment(list(normcounts = Data[,Test_Idx[[i]]]), colData = data.frame(cell_type1 = Labels[Test_Idx[[i]]])) logcounts(sce_test) <- log2(normcounts(sce_test) + 1) rowData(sce_test)$feature_symbol <- rownames(sce_test) sce_test@rowRanges@elementMetadata@listData = sce@rowRanges@elementMetadata@listData } # scmap-cell start_time <- Sys.time() set.seed(1) sce <- indexCell(sce) end_time <- Sys.time() Training_Time_scmapcell[i] <- as.numeric(difftime(end_time,start_time,units = 'secs')) start_time <- Sys.time() scmapCell_results <- scmapCell(sce_test,list(metadata(sce)$scmap_cell_index)) scmapCell_clusters <- scmapCell2Cluster(scmapCell_results,list(as.character(colData(sce)$cell_type1))) end_time <- Sys.time() Testing_Time_scmapcell[i] <- as.numeric(difftime(end_time,start_time,units = 'secs')) True_Labels_scmapcell[i] <- list(Labels[Test_Idx[[i]]]) Pred_Labels_scmapcell[i] <- list(scmapCell_clusters$combined_labs) } True_Labels_scmapcell <- as.vector(unlist(True_Labels_scmapcell)) Pred_Labels_scmapcell <- as.vector(unlist(Pred_Labels_scmapcell)) Training_Time_scmapcell <- as.vector(unlist(Training_Time_scmapcell)) Testing_Time_scmapcell <- as.vector(unlist(Testing_Time_scmapcell)) write.csv(True_Labels_scmapcell,paste0(OutputDir,'/scmapcell_true.csv'),row.names = FALSE) write.csv(Pred_Labels_scmapcell,paste0(OutputDir,'/scmapcell_pred.csv'),row.names = FALSE) write.csv(Training_Time_scmapcell,paste0(OutputDir,'/scmapcell_training_time.csv'),row.names = FALSE) write.csv(Testing_Time_scmapcell,paste0(OutputDir,'/scmapcell_test_time.csv'),row.names = FALSE) } if (args[6] == "0") { run_scmapcell(args[1], args[2], args[3], args[4]) } else { run_scmapcell(args[1], args[2], args[3], args[4], args[5], as.numeric(args[6])) } ================================================ FILE: Snakemake/Scripts/run_scmapcluster.R ================================================ args <- commandArgs(TRUE) run_scmapcluster <- function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){ " run scmapcluster Wrapper script to run scmap on a benchmark dataset with 5-fold cross validation, outputs lists of true and predicted cell labels as csv files, as well as computation time. Parameters ---------- DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes as row names and gene names as column names. LabelsPath : Cell population annotations file path (.csv). CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function. OutputDir : Output directory defining the path of the exported file. GeneOrderPath : Gene order file path (.csv) obtained from feature selection, defining the genes order for each cross validation fold, default is NULL. NumGenes : Number of genes used in case of feature selection (integer), default is NULL. " Data <- read.csv(DataPath,row.names = 1) Labels <- as.matrix(read.csv(LabelsPath)) load(CV_RDataPath) Labels <- as.vector(Labels[,col_Index]) Data <- Data[Cells_to_Keep,] Labels <- Labels[Cells_to_Keep] if(!is.null(GeneOrderPath) & !is.null (NumGenes)){ GenesOrder = read.csv(GeneOrderPath) } ############################################################################# # scmap # ############################################################################# library(scmap) library(SingleCellExperiment) True_Labels_scmapcluster <- list() Pred_Labels_scmapcluster <- list() Training_Time_scmapcluster <- list() Testing_Time_scmapcluster <- list() Data = t(as.matrix(Data)) for (i in c(1:n_folds)){ if(!is.null(GeneOrderPath) & !is.null (NumGenes)){ sce <- SingleCellExperiment(list(normcounts = Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Train_Idx[[i]]]), colData = data.frame(cell_type1 = Labels[Train_Idx[[i]]])) logcounts(sce) <- log2(normcounts(sce) + 1) # use gene names as feature symbols rowData(sce)$feature_symbol <- rownames(sce) sce <- selectFeatures(sce, n_features = NumGenes, suppress_plot = TRUE) sce_test <- SingleCellExperiment(list(normcounts = Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Test_Idx[[i]]]), colData = data.frame(cell_type1 = Labels[Test_Idx[[i]]])) logcounts(sce_test) <- log2(normcounts(sce_test) + 1) rowData(sce_test)$feature_symbol <- rownames(sce_test) sce_test@rowRanges@elementMetadata@listData = sce@rowRanges@elementMetadata@listData } else{ sce <- SingleCellExperiment(list(normcounts = Data[,Train_Idx[[i]]]), colData = data.frame(cell_type1 = Labels[Train_Idx[[i]]])) logcounts(sce) <- log2(normcounts(sce) + 1) # use gene names as feature symbols rowData(sce)$feature_symbol <- rownames(sce) sce <- selectFeatures(sce, suppress_plot = TRUE) sce_test <- SingleCellExperiment(list(normcounts = Data[,Test_Idx[[i]]]), colData = data.frame(cell_type1 = Labels[Test_Idx[[i]]])) logcounts(sce_test) <- log2(normcounts(sce_test) + 1) rowData(sce_test)$feature_symbol <- rownames(sce_test) sce_test@rowRanges@elementMetadata@listData = sce@rowRanges@elementMetadata@listData } # scmap-cluster start_time <- Sys.time() sce <- indexCluster(sce) end_time <- Sys.time() Training_Time_scmapcluster[i] <- as.numeric(difftime(end_time,start_time,units = 'secs')) start_time <- Sys.time() scmapCluster_results <- scmapCluster(projection = sce_test,index_list = list(metadata(sce)$scmap_cluster_index)) end_time <- Sys.time() Testing_Time_scmapcluster[i] <- as.numeric(difftime(end_time,start_time,units = 'secs')) True_Labels_scmapcluster[i] <- list(Labels[Test_Idx[[i]]]) Pred_Labels_scmapcluster[i] <- list(scmapCluster_results$combined_labs) } True_Labels_scmapcluster <- as.vector(unlist(True_Labels_scmapcluster)) Pred_Labels_scmapcluster <- as.vector(unlist(Pred_Labels_scmapcluster)) Training_Time_scmapcluster <- as.vector(unlist(Training_Time_scmapcluster)) Testing_Time_scmapcluster <- as.vector(unlist(Testing_Time_scmapcluster)) write.csv(True_Labels_scmapcluster,paste0(OutputDir,'/scmapcluster_true.csv'),row.names = FALSE) write.csv(Pred_Labels_scmapcluster,paste0(OutputDir,'/scmapcluster_pred.csv'),row.names = FALSE) write.csv(Training_Time_scmapcluster,paste0(OutputDir,'/scmapcluster_training_time.csv'),row.names = FALSE) write.csv(Testing_Time_scmapcluster,paste0(OutputDir,'/scmapcluster_test_time.csv'),row.names = FALSE) } if (args[6] == "0") { run_scmapcluster(args[1], args[2], args[3], args[4]) } else { run_scmapcluster(args[1], args[2], args[3], args[4], args[5], as.numeric(args[6])) } ================================================ FILE: Snakemake/Scripts/run_scmaptotal.R ================================================ run_scmap <- function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){ " run scmap Wrapper script to run scmap on a benchmark dataset with 5-fold cross validation, outputs lists of true and predicted cell labels as csv files, as well as computation time. Parameters ---------- DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes as row names and gene names as column names. LabelsPath : Cell population annotations file path (.csv). CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function. OutputDir : Output directory defining the path of the exported file. GeneOrderPath : Gene order file path (.csv) obtained from feature selection, defining the genes order for each cross validation fold, default is NULL. NumGenes : Number of genes used in case of feature selection (integer), default is NULL. " Data <- read.csv(DataPath,row.names = 1) Labels <- as.matrix(read.csv(LabelsPath)) load(CV_RDataPath) Labels <- as.vector(Labels[,col_Index]) Data <- Data[Cells_to_Keep,] Labels <- Labels[Cells_to_Keep] if(!is.null(GeneOrderPath) & !is.null (NumGenes)){ GenesOrder = read.csv(GeneOrderPath) } ############################################################################# # scmap # ############################################################################# library(scmap) library(SingleCellExperiment) True_Labels_scmapcluster <- list() Pred_Labels_scmapcluster <- list() True_Labels_scmapcell <- list() Pred_Labels_scmapcell <- list() Training_Time_scmapcluster <- list() Testing_Time_scmapcluster <- list() Training_Time_scmapcell <- list() Testing_Time_scmapcell <- list() Data = t(as.matrix(Data)) for (i in c(1:n_folds)){ if(!is.null(GeneOrderPath) & !is.null (NumGenes)){ sce <- SingleCellExperiment(list(normcounts = Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Train_Idx[[i]]]), colData = data.frame(cell_type1 = Labels[Train_Idx[[i]]])) logcounts(sce) <- log2(normcounts(sce) + 1) # use gene names as feature symbols rowData(sce)$feature_symbol <- rownames(sce) sce <- selectFeatures(sce, n_features = NumGenes, suppress_plot = TRUE) sce_test <- SingleCellExperiment(list(normcounts = Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Test_Idx[[i]]]), colData = data.frame(cell_type1 = Labels[Test_Idx[[i]]])) logcounts(sce_test) <- log2(normcounts(sce_test) + 1) rowData(sce_test)$feature_symbol <- rownames(sce_test) sce_test@rowRanges@elementMetadata@listData = sce@rowRanges@elementMetadata@listData } else{ sce <- SingleCellExperiment(list(normcounts = Data[,Train_Idx[[i]]]), colData = data.frame(cell_type1 = Labels[Train_Idx[[i]]])) logcounts(sce) <- log2(normcounts(sce) + 1) # use gene names as feature symbols rowData(sce)$feature_symbol <- rownames(sce) sce <- selectFeatures(sce, suppress_plot = TRUE) sce_test <- SingleCellExperiment(list(normcounts = Data[,Test_Idx[[i]]]), colData = data.frame(cell_type1 = Labels[Test_Idx[[i]]])) logcounts(sce_test) <- log2(normcounts(sce_test) + 1) rowData(sce_test)$feature_symbol <- rownames(sce_test) sce_test@rowRanges@elementMetadata@listData = sce@rowRanges@elementMetadata@listData } # scmap-cluster start_time <- Sys.time() sce <- indexCluster(sce) end_time <- Sys.time() Training_Time_scmapcluster[i] <- as.numeric(difftime(end_time,start_time,units = 'secs')) start_time <- Sys.time() scmapCluster_results <- scmapCluster(projection = sce_test,index_list = list(metadata(sce)$scmap_cluster_index)) end_time <- Sys.time() Testing_Time_scmapcluster[i] <- as.numeric(difftime(end_time,start_time,units = 'secs')) True_Labels_scmapcluster[i] <- list(Labels[Test_Idx[[i]]]) Pred_Labels_scmapcluster[i] <- list(scmapCluster_results$combined_labs) # scmap-cell start_time <- Sys.time() set.seed(1) sce <- indexCell(sce) end_time <- Sys.time() Training_Time_scmapcell[i] <- as.numeric(difftime(end_time,start_time,units = 'secs')) start_time <- Sys.time() scmapCell_results <- scmapCell(sce_test,list(metadata(sce)$scmap_cell_index)) scmapCell_clusters <- scmapCell2Cluster(scmapCell_results,list(as.character(colData(sce)$cell_type1))) end_time <- Sys.time() Testing_Time_scmapcell[i] <- as.numeric(difftime(end_time,start_time,units = 'secs')) True_Labels_scmapcell[i] <- list(Labels[Test_Idx[[i]]]) Pred_Labels_scmapcell[i] <- list(scmapCell_clusters$combined_labs) } True_Labels_scmapcluster <- as.vector(unlist(True_Labels_scmapcluster)) Pred_Labels_scmapcluster <- as.vector(unlist(Pred_Labels_scmapcluster)) True_Labels_scmapcell <- as.vector(unlist(True_Labels_scmapcell)) Pred_Labels_scmapcell <- as.vector(unlist(Pred_Labels_scmapcell)) Training_Time_scmapcluster <- as.vector(unlist(Training_Time_scmapcluster)) Testing_Time_scmapcluster <- as.vector(unlist(Testing_Time_scmapcluster)) Training_Time_scmapcell <- as.vector(unlist(Training_Time_scmapcell)) Testing_Time_scmapcell <- as.vector(unlist(Testing_Time_scmapcell)) setwd(OutputDir) if (!is.null(GeneOrderPath) & !is.null (NumGenes)){ write.csv(True_Labels_scmapcluster,paste('scmapcluster_',NumGenes,'_True_Labels.csv', sep = ''),row.names = FALSE) write.csv(Pred_Labels_scmapcluster,paste('scmapcluster_',NumGenes,'_Pred_Labels.csv', sep = ''),row.names = FALSE) write.csv(True_Labels_scmapcell,paste('scmapcell_',NumGenes,'_True_Labels.csv', sep = ''),row.names = FALSE) write.csv(Pred_Labels_scmapcell,paste('scmapcell_',NumGenes,'_Pred_Labels.csv', sep = ''),row.names = FALSE) write.csv(Training_Time_scmapcluster,paste('scmapcluster_',NumGenes,'_Training_Time.csv', sep = ''),row.names = FALSE) write.csv(Testing_Time_scmapcluster,paste('scmapcluster_',NumGenes,'_Testing_Time.csv', sep = ''),row.names = FALSE) write.csv(Training_Time_scmapcell,paste('scmapcell_',NumGenes,'_Training_Time.csv', sep = ''),row.names = FALSE) write.csv(Testing_Time_scmapcell,paste('scmapcell_',NumGenes,'_Testing_Time.csv', sep = ''),row.names = FALSE) } else{ write.csv(True_Labels_scmapcluster,'scmapcluster_True_Labels.csv',row.names = FALSE) write.csv(Pred_Labels_scmapcluster,'scmapcluster_Pred_Labels.csv',row.names = FALSE) write.csv(True_Labels_scmapcell,'scmapcell_True_Labels.csv',row.names = FALSE) write.csv(Pred_Labels_scmapcell,'scmapcell_Pred_Labels.csv',row.names = FALSE) write.csv(Training_Time_scmapcluster,'scmapcluster_Training_Time.csv',row.names = FALSE) write.csv(Testing_Time_scmapcluster,'scmapcluster_Testing_Time.csv',row.names = FALSE) write.csv(Training_Time_scmapcell,'scmapcell_Training_Time.csv',row.names = FALSE) write.csv(Testing_Time_scmapcell,'scmapcell_Testing_Time.csv',row.names = FALSE) } } ================================================ FILE: Snakemake/Scripts/run_singleCellNet.R ================================================ args <- commandArgs(TRUE) run_singleCellNet<-function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){ " run singleCellNet Wrapper script to run singleCellNet on a benchmark dataset with 5-fold cross validation, outputs lists of true and predicted cell labels as csv files, as well as computation time. Parameters ---------- DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes as row names and gene names as column names. LabelsPath : Cell population annotations file path (.csv). CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function. OutputDir : Output directory defining the path of the exported file. GeneOrderPath : Gene order file path (.csv) obtained from feature selection, defining the genes order for each cross validation fold, default is NULL. NumGenes : Number of genes used in case of feature selection (integer), default is NULL. " Data <- read.csv(DataPath,row.names = 1) colnames(Data) <- gsub('_','.',colnames(Data), fixed = TRUE) Labels <- as.matrix(read.csv(LabelsPath)) load(CV_RDataPath) Labels <- as.vector(Labels[,col_Index]) Data <- Data[Cells_to_Keep,] Labels <- Labels[Cells_to_Keep] if(!is.null(GeneOrderPath) & !is.null (NumGenes)){ GenesOrder = read.csv(GeneOrderPath) } ############################################################################# # singleCellNet # ############################################################################# library(singleCellNet) library(dplyr) True_Labels_singleCellNet <- list() Pred_Labels_singleCellNet <- list() Training_Time_singleCellNet <- list() Testing_Time_singleCellNet <- list() Data = t(as.matrix(Data)) # deals also with sparse matrix for(i in c(1:n_folds)){ if(!is.null(GeneOrderPath) & !is.null (NumGenes)){ DataTrain <- Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Train_Idx[[i]]] DataTest <- Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Test_Idx[[i]]] } else{ DataTrain <- Data[,Train_Idx[[i]]] DataTest <- Data[,Test_Idx[[i]]] } start_time <- Sys.time() cgenes2<-findClassyGenes(DataTrain, data.frame(Annotation = Labels[Train_Idx[[i]]]), "Annotation") cgenesA<-cgenes2[['cgenes']] grps<-cgenes2[['grps']] DataTrain<-as.matrix(DataTrain[cgenesA,]) xpairs<-ptGetTop(DataTrain, grps, ncores = 1) pdTrain<-query_transform(DataTrain[cgenesA, ], xpairs) rf<-sc_makeClassifier(pdTrain[xpairs,], genes=xpairs, groups=grps) end_time <- Sys.time() Training_Time_singleCellNet[i] <- as.numeric(difftime(end_time,start_time,units = 'secs')) start_time <- Sys.time() DataTest<-query_transform(DataTest[cgenesA,], xpairs) classRes <-rf_classPredict(rf, DataTest) end_time <- Sys.time() Testing_Time_singleCellNet[i] <- as.numeric(difftime(end_time,start_time,units = 'secs')) True_Labels_singleCellNet[i] <- list(Labels[Test_Idx[[i]]]) Pred_Labels_singleCellNet[i] <- list((rownames(classRes)[apply(classRes,2,which.max)])[1:length(Test_Idx[[i]])]) } True_Labels_singleCellNet <- as.vector(unlist(True_Labels_singleCellNet)) Pred_Labels_singleCellNet <- as.vector(unlist(Pred_Labels_singleCellNet)) Training_Time_singleCellNet <- as.vector(unlist(Training_Time_singleCellNet)) Testing_Time_singleCellNet <- as.vector(unlist(Testing_Time_singleCellNet)) write.csv(True_Labels_singleCellNet,paste0(OutputDir,'/singleCellNet_true.csv'),row.names = FALSE) write.csv(Pred_Labels_singleCellNet,paste0(OutputDir,'/singleCellNet_pred.csv'),row.names = FALSE) write.csv(Training_Time_singleCellNet,paste0(OutputDir,'/singleCellNet_training_time.csv'),row.names = FALSE) write.csv(Testing_Time_singleCellNet,paste0(OutputDir,'/singleCellNet_test_time.csv'),row.names = FALSE) } if (args[6] == "0") { run_singleCellNet(args[1], args[2], args[3], args[4]) } else { run_singleCellNet(args[1], args[2], args[3], args[4], args[5], as.numeric(args[6])) } ================================================ FILE: Snakemake/Snakefile ================================================ dockerTag = "latest" #FIXME tagged versions def feature_ranking(w): if "feature_ranking" in config.keys(): return config["feature_ranking"] else: return "{output_dir}/rank_genes_dropouts.csv".format( output_dir=w.output_dir) """ One rule to... rule... them all... """ rule all: input: tool_outputs = expand( "{output_dir}/evaluation/{measure}/{tool}.csv", tool=config["tools_to_run"], output_dir=config["output_dir"], measure=["Confusion", "F1", "PopSize", "Summary"]) """ Rule for the result evaluation """ rule evaluate: input: true="{output_dir}/{tool}/{tool}_true.csv", pred="{output_dir}/{tool}/{tool}_pred.csv" output: "{output_dir}/evaluation/Confusion/{tool}.csv", "{output_dir}/evaluation/F1/{tool}.csv", "{output_dir}/evaluation/PopSize/{tool}.csv", "{output_dir}/evaluation/Summary/{tool}.csv", log: "{output_dir}/evaluation/{tool}.log" singularity: "docker://scrnaseqbenchmark/baseline:{}".format(dockerTag) shell: "Rscript evaluate.R " "{input.true} " "{input.pred} " "{wildcards.output_dir}/evaluation " "{wildcards.tool} " "&> {log}" """ Rule for creating cross validation folds """ rule generate_CV_folds: input: config["labfile"], output: "{output_dir}/CV_folds.RData" log: "{output_dir}/CV_folds.log" params: column = config.get("column", 1) # default to 1 singularity: "docker://scrnaseqbenchmark/cross_validation:{}".format(dockerTag) shell: "Rscript Cross_Validation.R " "{input} " "{params.column} " "{wildcards.output_dir} " "&> {log}" """ Rule for creating feature rank lists """ rule generate_dropouts_feature_rankings: input: datafile = config["datafile"], folds = "{output_dir}/CV_folds.RData" output: "{output_dir}/rank_genes_dropouts.csv" log: "{output_dir}/rank_genes_dropouts.log" singularity: "docker://scrnaseqbenchmark/baseline:{}".format(dockerTag) shell: "echo test > {wildcards.output_dir}/test\n" "python3 rank_gene_dropouts.py " "{input.datafile} " "{input.folds} " "{wildcards.output_dir} " "&> {log}" """ Rule for R based tools. """ rule singleCellNet: input: datafile = config["datafile"], labfile = config["labfile"], folds = "{output_dir}/CV_folds.RData", ranking = feature_ranking output: pred = "{output_dir}/singleCellNet/singleCellNet_pred.csv", true = "{output_dir}/singleCellNet/singleCellNet_true.csv", test_time = "{output_dir}/singleCellNet/singleCellNet_test_time.csv", training_time = "{output_dir}/singleCellNet/singleCellNet_training_time.csv" log: "{output_dir}/singleCellNet/singleCellNet.log" params: n_features = config.get("number_of_features", 0) singularity: "docker://scrnaseqbenchmark/singlecellnet:{}".format(dockerTag) shell: "Rscript Scripts/run_singleCellNet.R " "{input.datafile} " "{input.labfile} " "{input.folds} " "{wildcards.output_dir}/singleCellNet " "{input.ranking} " "{params.n_features} " "&> {log}" rule scmapcell: input: datafile = config["datafile"], labfile = config["labfile"], folds = "{output_dir}/CV_folds.RData", ranking = feature_ranking output: pred = "{output_dir}/scmapcell/scmapcell_pred.csv", true = "{output_dir}/scmapcell/scmapcell_true.csv", test_time = "{output_dir}/scmapcell/scmapcell_test_time.csv", training_time = "{output_dir}/scmapcell/scmapcell_training_time.csv" log: "{output_dir}/scmapcell/scmapcell.log" params: n_features = config.get("number_of_features", 0) singularity: "docker://scrnaseqbenchmark/scmap:{}".format(dockerTag) shell: "Rscript Scripts/run_scmapcell.R " "{input.datafile} " "{input.labfile} " "{input.folds} " "{wildcards.output_dir}/scmapcell " "{input.ranking} " "{params.n_features} " "&> {log}" rule scmapcluster: input: datafile = config["datafile"], labfile = config["labfile"], folds = "{output_dir}/CV_folds.RData", ranking = feature_ranking output: pred = "{output_dir}/scmapcluster/scmapcluster_pred.csv", true = "{output_dir}/scmapcluster/scmapcluster_true.csv", test_time = "{output_dir}/scmapcluster/scmapcluster_test_time.csv", training_time = "{output_dir}/scmapcluster/scmapcluster_training_time.csv" log: "{output_dir}/scmapcluster/scmapcluster.log" params: n_features = config.get("number_of_features", 0) singularity: "docker://scrnaseqbenchmark/scmap:{}".format(dockerTag) shell: "Rscript Scripts/run_scmapcluster.R " "{input.datafile} " "{input.labfile} " "{input.folds} " "{wildcards.output_dir}/scmapcluster " "{input.ranking} " "{params.n_features} " "&> {log}" rule scID: input: datafile = config["datafile"], labfile = config["labfile"], folds = "{output_dir}/CV_folds.RData", ranking = feature_ranking output: pred = "{output_dir}/scID/scID_pred.csv", true = "{output_dir}/scID/scID_true.csv", total_time = "{output_dir}/scID/scID_total_time.csv" log: "{output_dir}/scID/scID.log" params: n_features = config.get("number_of_features", 0) singularity: "docker://scrnaseqbenchmark/scid:{}".format(dockerTag) shell: "Rscript Scripts/run_scID.R " "{input.datafile} " "{input.labfile} " "{input.folds} " "{wildcards.output_dir}/scID " "{input.ranking} " "{params.n_features} " "&> {log}" rule CHETAH: input: datafile = config["datafile"], labfile = config["labfile"], folds = "{output_dir}/CV_folds.RData", ranking = feature_ranking output: pred = "{output_dir}/CHETAH/CHETAH_pred.csv", true = "{output_dir}/CHETAH/CHETAH_true.csv", total_time = "{output_dir}/CHETAH/CHETAH_total_time.csv" log: "{output_dir}/CHETAH/CHETAH.log" params: n_features = config.get("number_of_features", 0) singularity: "docker://scrnaseqbenchmark/chetah:{}".format(dockerTag) shell: "Rscript Scripts/run_CHETAH.R " "{input.datafile} " "{input.labfile} " "{input.folds} " "{wildcards.output_dir}/CHETAH " "{input.ranking} " "{params.n_features} " "&> {log}" rule SingleR: input: datafile = config["datafile"], labfile = config["labfile"], folds = "{output_dir}/CV_folds.RData", ranking = feature_ranking output: pred = "{output_dir}/SingleR/SingleR_pred.csv", true = "{output_dir}/SingleR/SingleR_true.csv", total_time = "{output_dir}/SingleR/SingleR_total_time.csv" log: "{output_dir}/SingleR/SingleR.log" params: n_features = config.get("number_of_features", 0) singularity: "docker://scrnaseqbenchmark/singler:{}".format(dockerTag) shell: "Rscript Scripts/run_SingleR.R " "{input.datafile} " "{input.labfile} " "{input.folds} " "{wildcards.output_dir}/SingleR " "{input.ranking} " "{params.n_features} " "&> {log}" #NOTE non-conformant to the rest of the rules. rule Garnett_CV: input: datafile = config["datafile"], labfile = config["labfile"], folds = "{output_dir}/CV_folds.RData", genes_names = config.get("genes", "UNSPECIFIEDFILE"), markers = config.get("Garnett_CV", {}).get( "markers", "UNSPECIFIEDFILE") output: pred = "{output_dir}/Garnett_CV/Garnett_CV_pred.csv", true = "{output_dir}/Garnett_CV/Garnett_CV_true.csv", test_time = "{output_dir}/Garnett_CV/Garnett_CV_test_time.csv", training_time = "{output_dir}/Garnett_CV/Garnett_CV_training_time.csv" log: "{output_dir}/Garnett_CV/Garnett_CV.log" params: human = "T" if config.get("human", True) else "F" singularity: "docker://scrnaseqbenchmark/garnett:{}".format(dockerTag) shell: "Rscript Scripts/run_Garnett_CV.R " "{input.datafile} " "{input.labfile} " "{input.folds} " "{input.genes_names} " "{input.markers} " "{wildcards.output_dir}/Garnett_CV " "{params.human} " "&> {log}" #NOTE non-conformant to the rest of the rules. rule Garnett_Pretrained: #TODO test this input: datafile = config["datafile"], labfile = config["labfile"], folds = "{output_dir}/CV_folds.RData", genes_names = config.get("genes", "UNSPECIFIEDFILE"), classifier = config.get("Garnett_Pretrained", {}).get( "classifier", "UNSPECIFIEDFILE") output: pred = "{output_dir}/Garnett_Pretrained/Garnett_Pretrained_pred.csv", true = "{output_dir}/Garnett_Pretrained/Garnett_Pretrained_true.csv", test_time = "{output_dir}/Garnett_Pretrained/Garnett_Pretrained_test_time.csv" log: "{output_dir}/Garnett_Pretrained/Garnett_Pretrained.log" params: human = "T" if config.get("human", True) else "F" singularity: "docker://scrnaseqbenchmark/garnett:{}".format(dockerTag) shell: "Rscript Scripts/run_Garnett_Pretrained.R " "{input.datafile} " "{input.labfile} " "{input.genes_names} " "{input.folds} " "{input.classifier} " "{wildcards.output_dir}/Garnett_Pretrained " "{params.human} " "&> {log}" """ Rules for python based tools. """ rule kNN50: input: datafile = config["datafile"], labfile = config["labfile"], folds = "{output_dir}/CV_folds.RData", ranking = feature_ranking output: pred = "{output_dir}/kNN50/kNN50_pred.csv", true = "{output_dir}/kNN50/kNN50_true.csv", test_time = "{output_dir}/kNN50/kNN50_test_time.csv", training_time = "{output_dir}/kNN50/kNN50_training_time.csv" log: "{output_dir}/kNN50/kNN50.log" params: n_features = config.get("number_of_features", 0) singularity: "docker://scrnaseqbenchmark/baseline:{}".format(dockerTag) shell: "python3 Scripts/run_kNN50.py " "{input.datafile} " "{input.labfile} " "{input.folds} " "{wildcards.output_dir}/kNN50 " "{input.ranking} " "{params.n_features} " "&> {log}" rule kNN9: input: datafile = config["datafile"], labfile = config["labfile"], folds = "{output_dir}/CV_folds.RData", ranking = feature_ranking output: pred = "{output_dir}/kNN9/kNN9_pred.csv", true = "{output_dir}/kNN9/kNN9_true.csv", test_time = "{output_dir}/kNN9/kNN9_test_time.csv", training_time = "{output_dir}/kNN9/kNN9_training_time.csv" log: "{output_dir}/kNN9/kNN9.log" params: n_features = config.get("number_of_features", 0) singularity: "docker://scrnaseqbenchmark/baseline:{}".format(dockerTag) shell: "python3 Scripts/run_kNN9.py " "{input.datafile} " "{input.labfile} " "{input.folds} " "{wildcards.output_dir}/kNN9 " "{input.ranking} " "{params.n_features} " "&> {log}" rule Cell_BLAST: input: datafile = config["datafile"], labfile = config["labfile"], folds = "{output_dir}/CV_folds.RData", ranking = feature_ranking output: pred = "{output_dir}/Cell_BLAST/Cell_BLAST_pred.csv", true = "{output_dir}/Cell_BLAST/Cell_BLAST_true.csv", test_time = "{output_dir}/Cell_BLAST/Cell_BLAST_test_time.csv", training_time = "{output_dir}/Cell_BLAST/Cell_BLAST_training_time.csv" log: "{output_dir}/Cell_BLAST/Cell_BLAST.log" params: n_features = config.get("number_of_features", 0) singularity: "docker://scrnaseqbenchmark/cell_blast:{}".format(dockerTag) shell: "python3 Scripts/run_Cell_BLAST.py " "{input.datafile} " "{input.labfile} " "{input.folds} " "{wildcards.output_dir}/Cell_BLAST " "{input.ranking} " "{params.n_features} " "&> {log}" rule scVI: input: datafile = config["datafile"], labfile = config["labfile"], folds = "{output_dir}/CV_folds.RData", ranking = feature_ranking output: pred = "{output_dir}/scVI/scVI_pred.csv", true = "{output_dir}/scVI/scVI_true.csv", test_time = "{output_dir}/scVI/scVI_test_time.csv", training_time = "{output_dir}/scVI/scVI_training_time.csv" log: "{output_dir}/scVI/scVI.log" params: n_features = config.get("number_of_features", 0) singularity: "docker://scrnaseqbenchmark/scvi:{}".format(dockerTag) shell: "python3 Scripts/run_scVI.py " "{input.datafile} " "{input.labfile} " "{input.folds} " "{wildcards.output_dir}/scVI " "{input.ranking} " "{params.n_features} " "&> {log}" rule LDA: input: datafile = config["datafile"], labfile = config["labfile"], folds = "{output_dir}/CV_folds.RData", ranking = feature_ranking output: pred = "{output_dir}/LDA/LDA_pred.csv", true = "{output_dir}/LDA/LDA_true.csv", test_time = "{output_dir}/LDA/LDA_test_time.csv", training_time = "{output_dir}/LDA/LDA_training_time.csv" log: "{output_dir}/LDA/LDA.log" params: n_features = config.get("number_of_features", 0) singularity: "docker://scrnaseqbenchmark/baseline:{}".format(dockerTag) shell: "python3 Scripts/run_LDA.py " "{input.datafile} " "{input.labfile} " "{input.folds} " "{wildcards.output_dir}/LDA " "{input.ranking} " "{params.n_features} " "&> {log}" rule LDA_rejection: input: datafile = config["datafile"], labfile = config["labfile"], folds = "{output_dir}/CV_folds.RData", ranking = feature_ranking output: pred = "{output_dir}/LDA_rejection/LDA_rejection_pred.csv", true = "{output_dir}/LDA_rejection/LDA_rejection_true.csv", test_time = "{output_dir}/LDA_rejection/LDA_rejection_test_time.csv", training_time = "{output_dir}/LDA_rejection/LDA_rejection_training_time.csv" log: "{output_dir}/LDA_rejection/LDA_rejection.log" params: n_features = config.get("number_of_features", 0) singularity: "docker://scrnaseqbenchmark/baseline:{}".format(dockerTag) shell: "python3 Scripts/run_LDA_rejection.py " "{input.datafile} " "{input.labfile} " "{input.folds} " "{wildcards.output_dir}/LDA_rejection " "{input.ranking} " "{params.n_features} " "&> {log}" rule NMC: input: datafile = config["datafile"], labfile = config["labfile"], folds = "{output_dir}/CV_folds.RData", ranking = feature_ranking output: pred = "{output_dir}/NMC/NMC_pred.csv", true = "{output_dir}/NMC/NMC_true.csv", test_time = "{output_dir}/NMC/NMC_test_time.csv", training_time = "{output_dir}/NMC/NMC_training_time.csv" log: "{output_dir}/NMC/NMC.log" params: n_features = config.get("number_of_features", 0) singularity: "docker://scrnaseqbenchmark/baseline:{}".format(dockerTag) shell: "python3 Scripts/run_NMC.py " "{input.datafile} " "{input.labfile} " "{input.folds} " "{wildcards.output_dir}/NMC " "{input.ranking} " "{params.n_features} " "&> {log}" rule RF: input: datafile = config["datafile"], labfile = config["labfile"], folds = "{output_dir}/CV_folds.RData", ranking = feature_ranking output: pred = "{output_dir}/RF/RF_pred.csv", true = "{output_dir}/RF/RF_true.csv", test_time = "{output_dir}/RF/RF_test_time.csv", training_time = "{output_dir}/RF/RF_training_time.csv" log: "{output_dir}/RF/RF.log" params: n_features = config.get("number_of_features", 0) singularity: "docker://scrnaseqbenchmark/baseline:{}".format(dockerTag) shell: "python3 Scripts/run_RF.py " "{input.datafile} " "{input.labfile} " "{input.folds} " "{wildcards.output_dir}/RF " "{input.ranking} " "{params.n_features} " "&> {log}" rule SVM: input: datafile = config["datafile"], labfile = config["labfile"], folds = "{output_dir}/CV_folds.RData", ranking = feature_ranking output: pred = "{output_dir}/SVM/SVM_pred.csv", true = "{output_dir}/SVM/SVM_true.csv", test_time = "{output_dir}/SVM/SVM_test_time.csv", training_time = "{output_dir}/SVM/SVM_training_time.csv" log: "{output_dir}/SVM/SVM.log" params: n_features = config.get("number_of_features", 0) singularity: "docker://scrnaseqbenchmark/baseline:{}".format(dockerTag) shell: "python3 Scripts/run_SVM.py " "{input.datafile} " "{input.labfile} " "{input.folds} " "{wildcards.output_dir}/SVM " "{input.ranking} " "{params.n_features} " "&> {log}" rule SVM_rejection: input: datafile = config["datafile"], labfile = config["labfile"], folds = "{output_dir}/CV_folds.RData", ranking = feature_ranking output: pred = "{output_dir}/SVM_rejection/SVM_rejection_pred.csv", true = "{output_dir}/SVM_rejection/SVM_rejection_true.csv", test_time = "{output_dir}/SVM_rejection/SVM_rejection_test_time.csv", training_time = "{output_dir}/SVM_rejection/SVM_rejection_training_time.csv" log: "{output_dir}/SVM_rejection/SVM_rejection.log" params: n_features = config.get("number_of_features", 0) singularity: "docker://scrnaseqbenchmark/baseline:{}".format(dockerTag) shell: "python3 Scripts/run_SVM_rejection.py " "{input.datafile} " "{input.labfile} " "{input.folds} " "{wildcards.output_dir}/SVM_rejection " "{input.ranking} " "{params.n_features} " "&> {log}" ================================================ FILE: Snakemake/evaluate.R ================================================ args <- commandArgs(TRUE) TrueLabelsPath <- args[1] PredLabelsPath <- args[2] OutputDir <- args[3] ToolName <- args[4] evaluate <- function(TrueLabelsPath, PredLabelsPath, Indices = NULL){ " Script to evaluate the performance of the classifier. It returns multiple evaluation measures: the confusion matrix, median F1-score, F1-score for each class, accuracy, percentage of unlabeled, population size. The percentage of unlabeled cells is find by checking for cells that are labeled 'Unassigned', 'unassigned', 'Unknown', 'unknown', 'Nodexx', 'rand', or 'ambiguous'. Parameters ---------- TrueLabelsPath: csv file with the true labels (format: one column, no index) PredLabelsPath: csv file with the predicted labels (format: one column, no index) Indices: which part of the csv file should be read (e.g. if more datasets are tested at the same time) (format: c(begin, end)) Returns ------- Conf: confusion matrix MedF1 : median F1-score F1 : F1-score per class Acc : accuracy PercUnl : percentage of unlabeled cells PopSize : number of cells per cell type " true_lab <- unlist(read.csv(TrueLabelsPath)) pred_lab <- unlist(read.csv(PredLabelsPath)) if (! is.null(Indices)){ true_lab <- true_lab[Indices] pred_lab <- pred_lab[Indices] } unique_true <- unlist(unique(true_lab)) unique_pred <- unlist(unique(pred_lab)) unique_all <- unique(c(unique_true,unique_pred)) conf <- table(true_lab,pred_lab) pop_size <- rowSums(conf) pred_lab = gsub('Node..','Node',pred_lab) conf_F1 <- table(true_lab,pred_lab,exclude = c('unassigned','Unassigned','Unknown','rand','Node','ambiguous','unknown')) F1 <- vector() sum_acc <- 0 for (i in c(1:length(unique_true))){ findLabel = colnames(conf_F1) == row.names(conf_F1)[i] if(sum(findLabel)){ prec <- conf_F1[i,findLabel] / colSums(conf_F1)[findLabel] rec <- conf_F1[i,findLabel] / rowSums(conf_F1)[i] if (prec == 0 || rec == 0){ F1[i] = 0 } else{ F1[i] <- (2*prec*rec) / (prec + rec) } sum_acc <- sum_acc + conf_F1[i,findLabel] } else { F1[i] = 0 } } pop_size <- pop_size[pop_size > 0] names(F1) <- names(pop_size) med_F1 <- median(F1) total <- length(pred_lab) num_unlab <- sum(pred_lab == 'unassigned') + sum(pred_lab == 'Unassigned') + sum(pred_lab == 'rand') + sum(pred_lab == 'Unknown') + sum(pred_lab == 'unknown') + sum(pred_lab == 'Node') + sum(pred_lab == 'ambiguous') per_unlab <- num_unlab / total acc <- sum_acc/sum(conf_F1) result <- list(Conf = conf, MedF1 = med_F1, F1 = F1, Acc = acc, PercUnl = per_unlab, PopSize = pop_size) return(result) } results <- evaluate(TrueLabelsPath, PredLabelsPath) write.csv(results$Conf, file.path(OutputDir, "Confusion", paste0(ToolName, ".csv"))) write.csv(results$F1, file.path(OutputDir, "F1", paste0(ToolName, ".csv"))) write.csv(results$PopSize, file.path(OutputDir, "PopSize", paste0(ToolName, ".csv"))) df <- data.frame(results[c("MedF1", "Acc", "PercUnl")]) write.csv(df, file.path(OutputDir, "Summary", paste0(ToolName, ".csv"))) ================================================ FILE: Snakemake/example.config.yml ================================================ output_dir: output datafile: input/data.csv labfile: input/Labels.csv column: 1 number_of_features: 0 tools_to_run: - Cell_BLAST - scVI - scmapcell ================================================ FILE: Snakemake/rank_gene_dropouts.py ================================================ import os from sys import argv from pathlib import Path import rpy2.robjects as robjects import numpy as np import pandas as pd from sklearn import linear_model def rank_gene_dropouts(DataPath, CV_RDataPath, OutputDir): ''' Script to rank the genes in the training set of the inputfile based on their dropout level. This rank is written to a file. Parameters ---------- DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function. OutputDir : Output directory defining the path of the exported file. ''' # read the Rdata file robjects.r['load'](CV_RDataPath) nfolds = np.array(robjects.r['n_folds'], dtype = 'int') tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool') train_ind = np.array(robjects.r['Train_Idx']) # read the data data = pd.read_csv(DataPath,index_col=0,sep=',') data = data.iloc[tokeep] data = np.log2(data+1) genes = np.zeros([np.shape(data)[1],np.squeeze(nfolds)], dtype = '>U10') for i in range(np.squeeze(nfolds)): train_ind_i = np.array(train_ind[i], dtype = 'int') - 1 train=data.iloc[train_ind_i] train.columns = np.arange(len(train.columns)) # rank genes training set dropout = (train == 0).sum(axis='rows') dropout = (dropout / train.shape[0]) * 100 mean = train.mean(axis='rows') notzero = np.where((np.array(mean) > 0) & (np.array(dropout) > 0))[0] zero = np.where(~((np.array(mean) > 0) & (np.array(dropout) > 0)))[0] train_notzero = train.iloc[:,notzero] train_zero = train.iloc[:,zero] zero_genes = train_zero.columns dropout = dropout.iloc[notzero] mean = mean.iloc[notzero] dropout = np.log2(np.array(dropout)).reshape(-1,1) mean = np.array(mean).reshape(-1,1) reg = linear_model.LinearRegression() reg.fit(mean,dropout) residuals = dropout - reg.predict(mean) residuals = pd.Series(np.array(residuals).ravel(),index=train_notzero.columns) residuals = residuals.sort_values(ascending=False) sorted_genes = residuals.index sorted_genes = sorted_genes.append(zero_genes) genes[:,i] = sorted_genes.values genes = pd.DataFrame(genes) genes.to_csv(str(OutputDir / Path("rank_genes_dropouts.csv")), index = False) rank_gene_dropouts(argv[1], argv[2], argv[3]) ================================================ FILE: evaluate.R ================================================ evaluate <- function(TrueLabelsPath, PredLabelsPath, Indices = NULL){ " Script to evaluate the performance of the classifier. It returns multiple evaluation measures: the confusion matrix, median F1-score, F1-score for each class, accuracy, percentage of unlabeled, population size. The percentage of unlabeled cells is find by checking for cells that are labeled 'Unassigned', 'unassigned', 'Unknown', 'unknown', 'Nodexx', 'rand', or 'ambiguous'. Parameters ---------- TrueLabelsPath: csv file with the true labels (format: one column, no index) PredLabelsPath: csv file with the predicted labels (format: one column, no index) Indices: which part of the csv file should be read (e.g. if more datasets are tested at the same time) (format: c(begin, end)) Returns ------- Conf: confusion matrix MedF1 : median F1-score F1 : F1-score per class Acc : accuracy PercUnl : percentage of unlabeled cells PopSize : number of cells per cell type " true_lab <- unlist(read.csv(TrueLabelsPath)) pred_lab <- unlist(read.csv(PredLabelsPath)) if (! is.null(Indices)){ true_lab <- true_lab[Indices] pred_lab <- pred_lab[Indices] } unique_true <- unlist(unique(true_lab)) unique_pred <- unlist(unique(pred_lab)) unique_all <- unique(c(unique_true,unique_pred)) conf <- table(true_lab,pred_lab) pop_size <- rowSums(conf) pred_lab = gsub('Node..','Node',pred_lab) conf_F1 <- table(true_lab,pred_lab,exclude = c('unassigned','Unassigned','Unknown','rand','Node','ambiguous','unknown')) F1 <- vector() sum_acc <- 0 for (i in c(1:length(unique_true))){ findLabel = colnames(conf_F1) == row.names(conf_F1)[i] if(sum(findLabel)){ prec <- conf_F1[i,findLabel] / colSums(conf_F1)[findLabel] rec <- conf_F1[i,findLabel] / rowSums(conf_F1)[i] if (prec == 0 || rec == 0){ F1[i] = 0 } else{ F1[i] <- (2*prec*rec) / (prec + rec) } sum_acc <- sum_acc + conf_F1[i,findLabel] } else { F1[i] = 0 } } pop_size <- pop_size[pop_size > 0] names(F1) <- names(pop_size) med_F1 <- median(F1) total <- length(pred_lab) num_unlab <- sum(pred_lab == 'unassigned') + sum(pred_lab == 'Unassigned') + sum(pred_lab == 'rand') + sum(pred_lab == 'Unknown') + sum(pred_lab == 'unknown') + sum(pred_lab == 'Node') + sum(pred_lab == 'ambiguous') per_unlab <- num_unlab / total acc <- sum_acc/sum(conf_F1) result <- list(Conf = conf, MedF1 = med_F1, F1 = F1, Acc = acc, PercUnl = per_unlab, PopSize = pop_size) return(result) } ================================================ FILE: rank_gene_dropouts.py ================================================ import os import rpy2.robjects as robjects import numpy as np import pandas as pd from sklearn import linear_model def rank_gene_dropouts(DataPath, CV_RDataPath, OutputDir): ''' Script to rank the genes in the training set of the inputfile based on their dropout level. This rank is written to a file. Parameters ---------- DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function. OutputDir : Output directory defining the path of the exported file. ''' # read the Rdata file robjects.r['load'](CV_RDataPath) nfolds = np.array(robjects.r['n_folds'], dtype = 'int') tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool') train_ind = np.array(robjects.r['Train_Idx']) # read the data data = pd.read_csv(DataPath,index_col=0,sep=',') data = data.iloc[tokeep] data = np.log2(data+1) genes = np.zeros([np.shape(data)[1],np.squeeze(nfolds)], dtype = '>U10') for i in range(np.squeeze(nfolds)): train_ind_i = np.array(train_ind[i], dtype = 'int') - 1 train=data.iloc[train_ind_i] train.columns = np.arange(len(train.columns)) # rank genes training set dropout = (train == 0).sum(axis='rows') dropout = (dropout / train.shape[0]) * 100 mean = train.mean(axis='rows') notzero = np.where((np.array(mean) > 0) & (np.array(dropout) > 0))[0] zero = np.where(~((np.array(mean) > 0) & (np.array(dropout) > 0)))[0] train_notzero = train.iloc[:,notzero] train_zero = train.iloc[:,zero] zero_genes = train_zero.columns dropout = dropout.iloc[notzero] mean = mean.iloc[notzero] dropout = np.log2(np.array(dropout)).reshape(-1,1) mean = np.array(mean).reshape(-1,1) reg = linear_model.LinearRegression() reg.fit(mean,dropout) residuals = dropout - reg.predict(mean) residuals = pd.Series(np.array(residuals).ravel(),index=train_notzero.columns) residuals = residuals.sort_values(ascending=False) sorted_genes = residuals.index sorted_genes = sorted_genes.append(zero_genes) genes[:,i] = sorted_genes.values genes = pd.DataFrame(genes) os.chdir(OutputDir) genes.to_csv("rank_genes_dropouts.csv", index = False)