[
  {
    "path": "Cross_Validation.R",
    "content": "Cross_Validation <- function(LabelsPath, col_Index = 1,OutputDir){\r\n  \"\r\n  Cross_Validation\r\n  Function returns train and test indices for 5 folds stratified across unique cell populations,\r\n  and also filters out cell populations with 10 or fewer cells.\r\n  It returns a 'CV_folds.RData' file which is then used as input to the classifier wrappers.\r\n  \r\n  Parameters\r\n  ----------\r\n  LabelsPath : Cell population annotations file path (.csv).\r\n  col_Index : column index (integer) defining which level of annotation to use,\r\n  in case of multiple cell type annotations (default is 1)\r\n  OutputDir : Output directory defining the path of the exported file.\r\n  \"\r\n  \r\n  Labels <- as.matrix(read.csv(LabelsPath))\r\n  Labels <- as.vector(Labels[,col_Index])\r\n  \r\n  Removed_classes <- !(table(Labels) > 10)\r\n  Cells_to_Keep <- !(is.element(Labels,names(Removed_classes)[Removed_classes]))\r\n  Labels <- Labels[Cells_to_Keep]\r\n  \r\n  # Getting training and testing Folds\r\n  library(rBayesianOptimization)\r\n  n_folds = 5\r\n  Folds <- KFold(Labels,nfolds = n_folds, stratified = TRUE)\r\n  Test_Folds <- c(n_folds:1)\r\n  Train_Idx <- list()\r\n  Test_Idx <- list()\r\n  for (i in c(1:length(Folds))){\r\n    Temp_Folds <- Folds\r\n    Temp_Folds[Test_Folds[i]] <- NULL\r\n    Train_Idx[i] <- list(unlist(Temp_Folds))\r\n    Test_Idx[i] <- Folds[Test_Folds[i]]\r\n  }\r\n  remove(Temp_Folds,i,Folds)\r\n  setwd(OutputDir)\r\n  save(n_folds,Train_Idx,Test_Idx,col_Index,Cells_to_Keep,file = 'CV_folds.RData')\r\n}"
  },
  {
    "path": "DEgenesMAST.R",
    "content": "DEgenesMAST <- function(Data, Labels, Normalize = FALSE, LogTransform = FALSE){\r\n  # This function applies a differential expression test to the data using one vs all\r\n  # The training data should be used as an input\r\n  # The output is a matrix with marker genes where the columns are the cell populations and the rows are the top20 marker genes\r\n  # This output can be rewritten to the format of the prior-knowledge-supervised classifiers and afterwards be used to classify the test set.\r\n  \r\n  # Data: genes X cells (rows = genes, columns = cells)\r\n  # Labels: labels of the data\r\n  # Normalize: the input for MAST should be cpm normalized data, \r\n  #            if the data is not normalized yet, this should be set to TRUE\r\n  # LogTransform: the input for MAST should be logtransformed,\r\n  #            if the data is not logtransformed yet, this should be set to TRUE\r\n  \r\n  \r\n  library(Seurat)\r\n  \r\n  if(Normalize)\r\n  {\r\n    Data <- apply(Data, 2, function(x) (x/sum(x))*1000000)\r\n  }\r\n  \r\n  if(LogTransform)\r\n  {\r\n    Data <- log(Data+1, base = 2)\r\n  }\r\n  SeuObj <- CreateSeuratObject(raw.data = Data, project = \"DEgenes\")\r\n  SeuObj <- SetIdent(SeuObj, ident.use = Labels)\r\n  DEgenes <- FindAllMarkers(SeuObj, test.use = \"MAST\")\r\n  Markers <- matrix(nrow = 20,ncol = length(unique(Labels)))\r\n  colnames(Markers) <- unique(Labels)\r\n  for (i in unique(Labels)){\r\n    i\r\n    TempList <- DEgenes$gene[((DEgenes$cluster == i) & (DEgenes$avg_logFC > 0))]\r\n    MarkerGenes <- DEgenes$p_val_adj[DEgenes$cluster == i]\r\n    print(MarkerGenes[1:20])\r\n    if (length(TempList) >= 20){\r\n      Markers[,i] <- TempList[1:20]\r\n    }\r\n    else{\r\n      if(length(TempList) > 0){\r\n        Markers[c(1:length(TempList)),i] <- TempList\r\n      }\r\n    }\r\n  }\r\n  return(Markers)\r\n}\r\n"
  },
  {
    "path": "LICENSE",
    "content": "MIT License\n\nCopyright (c) 2019 tabdelaal\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n"
  },
  {
    "path": "README.md",
    "content": "# A comparison of automatic cell identification methods for single-cell RNA-sequencing data\nWe present a comprehensive evaluation of the performance of state-of-the-art classification methods, in addition to general-purpose classifiers, for automatic cell identification in single cell RNA-sequencing datasets. Our goal is to provide the community with a fair evaluation of all available methods to facilitate the users’ choice as well as direct further developments to focus on the challenging aspects of automated cell type identification. (published in Genome Biology Sep. 2019 https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1795-z)\n\n### Repository description\nWe provide all the scripts to run and evaluate all classifiers, and to reproduce the results introduced in the paper.\n\n\n1. 'Scripts' folder contains a wrapper function to read the data and apply a certain classification method.\n2. ```Cross_Validation``` R script can be used to produce training and test indices for cross validation.\n3. ```rank_gene_dropouts``` Python script can be used to apply feature selection using the dropout method, and rank genes accordingly.\n4. ```evaluate``` R script can be used to evaluate the prediction of a certain classifier and obtain scores such as accuracy, median F1-score and % unlabeled cells.\n\nFor more details, please check the function documentation.\n\n### General Usage\n\nTo benchmark and fairly evaluate the performance of different classifiers using benchmark-datasets (Filtered datasets can be downloaded from https://zenodo.org/record/3357167), apply the following steps:\n\n#### Step 1\n\nApply the ```Cross_Validation``` R function on the corresponding dataset to obtain fixed training and test cell indices, stratified across different cell types. 
For example, using the Tabula Muris (TM) dataset\n\n```R\nCross_Validation('~/TM/Labels.csv', 1, '~/TM/')\n```\n\nThis command will create a ```CV_folds.RData``` file used as input in Step 2.\n\n#### Step 2\n\nRun each classifier wrapper. For example, running scPred on TM dataset\n\n```R\nrun_scPred('~/TM/Filtered_TM_data.csv','~/TM/Labels.csv','~/TM/CV_folds.RData','~/Results/TM/')\n```\n\nThis command will output the true and predicted cell labels as csv files, as well as the classifier computation time.\n\n#### Step 3\n\nEvaluate the classifier prediction by \n\n```R\nresult <- evaluate('~/Results/TM/scPred_True_Labels.csv', '~/Results/TM/scPred_Pred_Labels.csv')\n```\n\nThis command will return the corresponding accuracy, median F1-score, F1-scores for all cell populations, % unlabeled cells, and confusion matrix.\n\n### Usage with feature selection\n\n#### Step 1\n\nApply the ```Cross_Validation``` R function on the corresponding dataset to obtain fixed training and test cell indices, stratified across different cell types. For example, using the Tabula Muris (TM) dataset\n\n```R\nCross_Validation('~/TM/Labels.csv', 1, '~/TM/')\n```\n\nThis command will create a ```CV_folds.RData``` file used as input in Step 2 and 3.\n\n#### Step 2\n\nApply the ```rank_gene_dropouts``` Python script to get the genes ranking for each training fold using the dropout criteria\n\n```\nrank_gene_dropouts('~/TM/Filtered_TM_data.csv', '~/TM/CV_folds.RData', '~/TM/')\n```\n\nThis command will create a ```rank_genes_dropouts.csv``` file used as input in Step 3.\n\n#### Step 3\n\nRun each classifier wrapper. 
For example, running scPred on TM dataset with 1000 genes\n\n```R\nrun_scPred('~/TM/Filtered_TM_data.csv','~/TM/Labels.csv','~/TM/CV_folds.RData','~/Results/TM/',\nGeneOrderPath = '~/TM/rank_genes_dropouts.csv',NumGenes = 1000)\n```\n\nThis command will output the true and predicted cell labels as csv files, as well as the classifier computation time.\n\n#### Step 4\n\nEvaluate the classifier prediction by \n\n```R\nresult <- evaluate('~/Results/TM/scPred_True_Labels.csv', '~/Results/TM/scPred_Pred_Labels.csv')\n```\n\nThis command will return the corresponding accuracy, median F1-score, F1-scores for all cell populations, % unlabeled cells, and confusion matrix.\n\n### Evaluate Marker-based methods using DE genes\n\nTo evaluate the marker-based methods SCINA, DigitalCellSorter and Garnett using DE genes learned from the data, you may follow these steps:\n\n#### Step 1\n\nApply the ```Cross_Validation``` R function on the corresponding dataset to obtain fixed training and test cell indices, stratified across different cell types. 
For example, using the Zheng_sorted dataset\n\n```R\nCross_Validation('~/Zheng_sorted/Labels.csv', 1, '~/Zheng_sorted/')\n```\n\nThis command will create a ```CV_folds.RData``` file used as input in Step 2 and 3.\n\n#### Step 2\n\nFor each fold use the training data to get the DE genes using the ```DEgenesMAST``` R function, and pass these DE genes to the corresponding method, for example here we use SCINA, to obtain cell prediction for the test data.\n\n```R\nload('CV_folds.RData')\nData <- read.csv('~/Zheng_sorted/Filtered_DownSampled_SortedPBMC_data.csv',row.names = 1)\nLabels <- as.matrix(read.csv('~/Zheng_sorted/Labels.csv'))\nLabels <- as.vector(Labels[,col_Index])\nData <- Data[Cells_to_Keep,]\nLabels <- Labels[Cells_to_Keep]\n\nfor (i in c(1:n_folds))\n{\n    MarkerGenes <-  DEgenesMAST(t(Data[Train_Idx[[i]],]), Labels[Train_Idx[[i]]], Normalize = TRUE, LogTransform = TRUE)\n    ## write the MarkerGenes into a marker genes file format, depending on the tested method, for example for SCINA\n    write.csv(MarkerGenes, 'MarkerGenes.csv')\n    ## run the SCINA wrapper using these DE marker genes\n    run_SCINA(Data[Test_Idx[[i]],], Labels[Test_Idx[[i]]], 'MarkerGenes.csv', '~/Results/Zheng_sorted/')\n}\n```\n\n### Snakemake\n\nTo support future extension of this benchmarking work with new classifiers and datasets, we provide a Snakemake workflow to automate the performed benchmarking analyses (https://github.com/tabdelaal/scRNAseq_Benchmark/tree/snakemake_and_docker).\n"
  },
  {
    "path": "Scripts/run_ACTINN.py",
    "content": "import os \r\nimport numpy as np\r\nimport pandas as pd\r\nimport time as tm\r\nimport rpy2.robjects as robjects\r\n\r\ndef run_ACTINN(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = \"\", NumGenes = 0):\r\n    '''\r\n    run ACTINN\r\n    Wrapper script to run ACTINN on a benchmark dataset with 5-fold cross validation,\r\n    outputs lists of true and predicted cell labels as csv files, as well as computation time.\r\n  \r\n    Parameters\r\n    ----------\r\n    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes \r\n    as row names and gene names as column names.\r\n    LabelsPath : Cell population annotations file path (.csv).\r\n    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.\r\n    OutputDir : Output directory defining the path of the exported file.\r\n    GeneOrderPath : Gene order file path (.csv) obtained from feature selection, \r\n    defining the genes order for each cross validation fold, default is NULL.\r\n    NumGenes : Number of genes used in case of feature selection (integer), default is 0.\r\n    '''\r\n    \r\n    # read the Rdata file\r\n    robjects.r['load'](CV_RDataPath)\r\n\r\n    nfolds = np.array(robjects.r['n_folds'], dtype = 'int')\r\n    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')\r\n    col = np.array(robjects.r['col_Index'], dtype = 'int')\r\n    col = col - 1 \r\n    test_ind = np.array(robjects.r['Test_Idx'])\r\n    train_ind = np.array(robjects.r['Train_Idx'])\r\n\r\n    # read the data\r\n    data = pd.read_csv(DataPath,index_col=0,sep=',')\r\n    labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col)\r\n    \r\n    labels = labels.iloc[tokeep]\r\n    data = data.iloc[tokeep]\r\n    \r\n    # read the feature file\r\n    if (NumGenes > 0):\r\n        features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')\r\n    \r\n    # folder with results\r\n    
os.chdir(OutputDir)\r\n    \r\n    tot=[]\r\n    truelab = []\r\n    pred = []\r\n\r\n    for i in range(np.squeeze(nfolds)):\r\n        test_ind_i = np.array(test_ind[i], dtype = 'int') - 1\r\n        train_ind_i = np.array(train_ind[i], dtype = 'int') - 1\r\n    \r\n        train=data.iloc[train_ind_i]\r\n        test=data.iloc[test_ind_i]\r\n        y_train=labels.iloc[train_ind_i]\r\n        y_test=labels.iloc[test_ind_i]\r\n        \r\n        if (NumGenes > 0):\r\n            feat_to_use = features.iloc[0:NumGenes,i]\r\n            train = train.iloc[:,feat_to_use]\r\n            test = test.iloc[:,feat_to_use]\r\n        \r\n        train = train.transpose()\r\n        test = test.transpose()\r\n        \r\n        train.to_csv(\"train.csv\")\r\n        test.to_csv(\"test.csv\")\r\n        y_train.to_csv(\"train_lab.csv\", header = False, index = True, sep = '\\t')\r\n        y_test.to_csv(\"test_lab.csv\", header = False, index = True, sep = '\\t')\r\n        \r\n        tm.sleep(60)\r\n            \r\n        os.system(\"python /home/nfs/lcmmichielsen/classifiers/ACTINN/actinn_format.py -i train.csv -o train -f csv\")\r\n        os.system(\"python /home/nfs/lcmmichielsen/classifiers/ACTINN/actinn_format.py -i test.csv -o test -f csv\")\r\n        \r\n        start = tm.time()\r\n        os.system(\"python /home/nfs/lcmmichielsen/classifiers/ACTINN/actinn_predict.py -trs train.h5 -trl train_lab.csv -ts test.h5\")    \r\n        tot.append(tm.time()-start)\r\n        \r\n        tm.sleep(60)\r\n\r\n        truelab.extend(y_test.values)\r\n        predlabels = pd.read_csv('predicted_label.txt',header=0,index_col=None, sep='\\t', usecols = [1])            \r\n        pred.extend(predlabels.values)\r\n    \r\n            \r\n    truelab = pd.DataFrame(truelab)\r\n    pred = pd.DataFrame(pred)\r\n    tot_time = pd.DataFrame(tot)\r\n    \r\n    if (NumGenes == 0):  \r\n        truelab.to_csv(\"ACTINN_True_Labels.csv\", index = False)\r\n        
pred.to_csv(\"ACTINN_Pred_Labels.csv\", index = False)\r\n        tot_time.to_csv(\"ACTINN_Total_Time.csv\", index = False)\r\n    else:\r\n        truelab.to_csv(\"ACTINN_\" + str(NumGenes) + \"_True_Labels.csv\", index = False)\r\n        pred.to_csv(\"ACTINN_\" + str(NumGenes) + \"_Pred_Labels.csv\", index = False)\r\n        tot_time.to_csv(\"ACTINN_\" + str(NumGenes) + \"_Total_Time.csv\", index = False)\r\n        \r\n        \r\n        \r\n        \r\n        \r\n        \r\n        \r\n"
  },
  {
    "path": "Scripts/run_CHETAH.R",
    "content": "run_CHETAH<-function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){\r\n  \"\r\n  run CHETAH\r\n  Wrapper script to run CHETAH on a benchmark dataset with 5-fold cross validation,\r\n  outputs lists of true and predicted cell labels as csv files, as well as computation time.\r\n  \r\n  Parameters\r\n  ----------\r\n  DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes \r\n  as row names and gene names as column names.\r\n  LabelsPath : Cell population annotations file path (.csv).\r\n  CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.\r\n  OutputDir : Output directory defining the path of the exported file.\r\n  GeneOrderPath : Gene order file path (.csv) obtained from feature selection, \r\n  defining the genes order for each cross validation fold, default is NULL.\r\n  NumGenes : Number of genes used in case of feature selection (integer), default is NULL.\r\n  \"\r\n  \r\n  Data <- read.csv(DataPath,row.names = 1)\r\n  Labels <- as.matrix(read.csv(LabelsPath))\r\n  load(CV_RDataPath)\r\n  Labels <- as.vector(Labels[,col_Index])\r\n  Data <- Data[Cells_to_Keep,]\r\n  Labels <- Labels[Cells_to_Keep]\r\n  if(!is.null(GeneOrderPath) & !is.null (NumGenes)){\r\n    GenesOrder = read.csv(GeneOrderPath)\r\n  }\r\n  \r\n  #############################################################################\r\n  #                                CHETAH                                     #\r\n  #############################################################################\r\n  library(CHETAH)\r\n  library(SingleCellExperiment)\r\n  True_Labels_CHETAH <- list()\r\n  Pred_Labels_CHETAH <- list()\r\n  Total_Time_CHETAH <- list()\r\n  Data = t(as.matrix(Data))\r\n  \r\n  for (i in c(1:n_folds)){\r\n    if(!is.null(GeneOrderPath) & !is.null (NumGenes)){\r\n      sce <- SingleCellExperiment(assays = list(counts = 
Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Train_Idx[[i]]]), \r\n                                  colData = data.frame(celltypes = Labels[Train_Idx[[i]]]))\r\n      \r\n      sce_test <- SingleCellExperiment(assays = list(counts = Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Test_Idx[[i]]]), \r\n                                       colData = data.frame(celltypes = Labels[Test_Idx[[i]]]))\r\n      start_time <- Sys.time()\r\n      sce_test <- CHETAHclassifier(input = sce_test, ref_cells = sce, n_genes = NumGenes)\r\n      end_time <- Sys.time()\r\n    }\r\n    else{\r\n      sce <- SingleCellExperiment(assays = list(counts = Data[,Train_Idx[[i]]]), \r\n                                  colData = data.frame(celltypes = Labels[Train_Idx[[i]]]))\r\n      \r\n      sce_test <- SingleCellExperiment(assays = list(counts = Data[,Test_Idx[[i]]]), \r\n                                       colData = data.frame(celltypes = Labels[Test_Idx[[i]]]))\r\n      start_time <- Sys.time()\r\n      sce_test <- CHETAHclassifier(input = sce_test, ref_cells = sce)\r\n      end_time <- Sys.time()\r\n    }\r\n    \r\n    Total_Time_CHETAH[i] <- as.numeric(difftime(end_time,start_time,units = 'secs'))\r\n    \r\n    True_Labels_CHETAH[i] <- list(Labels[Test_Idx[[i]]])\r\n    Pred_Labels_CHETAH[i] <- list(sce_test$celltype_CHETAH)\r\n  }\r\n  True_Labels_CHETAH <- as.vector(unlist(True_Labels_CHETAH))\r\n  Pred_Labels_CHETAH <- as.vector(unlist(Pred_Labels_CHETAH))\r\n  Total_Time_CHETAH <- as.vector(unlist(Total_Time_CHETAH))\r\n  \r\n  setwd(OutputDir)\r\n  \r\n  if (!is.null(GeneOrderPath) & !is.null (NumGenes)){\r\n    write.csv(True_Labels_CHETAH,paste('CHETAH_',NumGenes,'_True_Labels.csv', sep = ''),row.names = FALSE)\r\n    write.csv(Pred_Labels_CHETAH,paste('CHETAH_',NumGenes,'_Pred_Labels.csv', sep = ''),row.names = FALSE)\r\n    write.csv(Total_Time_CHETAH,paste('CHETAH_',NumGenes,'_Total_Time.csv', sep = ''),row.names = FALSE)\r\n  }\r\n  else{\r\n    
write.csv(True_Labels_CHETAH,'CHETAH_True_Labels.csv',row.names = FALSE)\r\n    write.csv(Pred_Labels_CHETAH,'CHETAH_Pred_Labels.csv',row.names = FALSE)\r\n    write.csv(Total_Time_CHETAH,'CHETAH_Total_Time.csv',row.names = FALSE)\r\n  }\r\n}\r\n"
  },
  {
    "path": "Scripts/run_CaSTLe.R",
    "content": "run_CaSTLe<-function(DataPath,LabelsPath,CV_RDataPath, OutputDir, GeneOrderPath = NULL, NumGenes = NULL){\r\n  \"\r\n  run CaSTLe\r\n  Wrapper script to run CaSTLe on a benchmark dataset with 5-fold cross validation,\r\n  outputs lists of true and predicted cell labels as csv files, as well as computation time.\r\n  \r\n  Parameters\r\n  ----------\r\n  DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes \r\n  as row names and gene names as column names.\r\n  LabelsPath : Cell population annotations file path (.csv).\r\n  CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.\r\n  OutputDir : Output directory defining the path of the exported file.\r\n  GeneOrderPath : Gene order file path (.csv) obtained from feature selection, \r\n  defining the genes order for each cross validation fold, default is NULL.\r\n  NumGenes : Number of genes used in case of feature selection (integer), default is NULL.\r\n  \"\r\n  \r\n  Data <- read.csv(DataPath,row.names = 1)\r\n  Labels <- as.matrix(read.csv(LabelsPath))\r\n  load(CV_RDataPath)\r\n  Labels <- as.vector(Labels[,col_Index])\r\n  Data <- Data[Cells_to_Keep,]\r\n  Labels <- Labels[Cells_to_Keep]\r\n  if(!is.null(GeneOrderPath) & !is.null (NumGenes)){\r\n    GenesOrder = read.csv(GeneOrderPath)\r\n  }\r\n  \r\n  #############################################################################\r\n  #                                CaSTLe                                     #\r\n  #############################################################################\r\n  library(igraph)\r\n  library(xgboost)\r\n  True_Labels_Castle <- list()\r\n  Pred_Labels_Castle <- list()\r\n  Training_Time_Castle <- list()\r\n  Testing_Time_Castle <- list()\r\n  \r\n  BREAKS=c(-1, 0, 1, 6, Inf)\r\n  nFeatures = 100\r\n  \r\n  for(i in c(1:n_folds)){\r\n    # 1. 
Load datasets\r\n    if(!is.null(GeneOrderPath) & !is.null (NumGenes)){\r\n      ds1 = Data[Train_Idx[[i]],as.vector(GenesOrder[c(1:NumGenes),i])+1]\r\n      ds2 = Data[Test_Idx[[i]],as.vector(GenesOrder[c(1:NumGenes),i])+1]\r\n    }\r\n    else{\r\n      ds1 = Data[Train_Idx[[i]],]\r\n      ds2 = Data[Test_Idx[[i]],]\r\n    }\r\n    \r\n    sourceCellTypes = as.factor(Labels[Train_Idx[[i]]])\r\n    targetCellTypes = as.factor(Labels[Test_Idx[[i]]])\r\n    \r\n    start_time <- Sys.time()\r\n    # 2. Unify sets, excluding low expressed genes\r\n    source_n_cells_counts = apply(ds1, 2, function(x) { sum(x > 0) } )\r\n    target_n_cells_counts = apply(ds2, 2, function(x) { sum(x > 0) } )\r\n    common_genes = intersect( colnames(ds1)[source_n_cells_counts>10], \r\n                              colnames(ds2)[target_n_cells_counts>10])\r\n    remove(source_n_cells_counts, target_n_cells_counts)\r\n    ds1 = ds1[, colnames(ds1) %in% common_genes]\r\n    ds2 = ds2[, colnames(ds2) %in% common_genes]\r\n    ds = rbind(ds1[,common_genes], ds2[,common_genes])\r\n    isSource = c(rep(TRUE,nrow(ds1)), rep(FALSE,nrow(ds2)))\r\n    remove(ds1, ds2)\r\n    \r\n    # 3. Highest mean in both source and target\r\n    topFeaturesAvg = colnames(ds)[order(apply(ds, 2, mean), decreasing = T)]\r\n    end_time <- Sys.time()\r\n    Training_Time_Castle[i] <- as.numeric(difftime(end_time,start_time,units = 'secs'))\r\n    \r\n    start_time <- Sys.time()\r\n    # for each cell - what is the most probable classification?\r\n    L = length(levels(sourceCellTypes))\r\n    targetClassification = as.data.frame(matrix(rep(0,L*sum(!isSource)), nrow=L), row.names = levels(sourceCellTypes))\r\n    \r\n    for (cellType in levels(sourceCellTypes)) {\r\n      \r\n      inSourceCellType = as.factor(ifelse(sourceCellTypes == cellType, cellType, paste0(\"NOT\",cellType)))\r\n      \r\n      # 4. 
Highest mutual information in source\r\n      topFeaturesMi = names(sort(apply(ds[isSource,],2,function(x) { compare(cut(x,breaks=BREAKS),inSourceCellType,method = \"nmi\") }), decreasing = T))\r\n      \r\n      # 5. Top n genes that appear in both mi and avg\r\n      selectedFeatures = union(head(topFeaturesAvg, nFeatures) , head(topFeaturesMi, nFeatures) )\r\n      \r\n      # 6. remove correlated features\r\n      tmp = cor(ds[,selectedFeatures], method = \"pearson\")\r\n      tmp[!lower.tri(tmp)] = 0\r\n      selectedFeatures = selectedFeatures[apply(tmp,2,function(x) any(x < 0.9))]\r\n      remove(tmp)\r\n      \r\n      # 7,8. Convert data from continous to binned dummy vars\r\n      # break datasets to bins\r\n      dsBins = apply(ds[, selectedFeatures], 2, cut, breaks= BREAKS)\r\n      # use only bins with more than one value\r\n      nUniq = apply(dsBins, 2, function(x) { length(unique(x)) })\r\n      # convert to dummy vars\r\n      ds0 = model.matrix(~ . , as.data.frame(dsBins[,nUniq>1]))\r\n      remove(dsBins, nUniq)\r\n      \r\n      cat(paste0(\"<h2>Classifier for \",cellType,\"</h2>\"))\r\n      \r\n      inTypeSource = sourceCellTypes == cellType\r\n      # 9. Classify\r\n      xg=xgboost(data=ds0[isSource,] , \r\n                 label=inTypeSource,\r\n                 objective=\"binary:logistic\", \r\n                 eta=0.7 , nthread=1, nround=20, verbose=0,\r\n                 gamma=0.001, max_depth=5, min_child_weight=10)\r\n      \r\n      # 10. 
Predict\r\n      inTypeProb = predict(xg, ds0[!isSource, ])\r\n      \r\n      targetClassification[cellType,] = inTypeProb\r\n    }\r\n    end_time <- Sys.time()\r\n    Testing_Time_Castle[i] <- as.numeric(difftime(end_time,start_time,units = 'secs'))\r\n    \r\n    True_Labels_Castle[i] <- list(Labels[Test_Idx[[i]]])\r\n    Pred_Labels_Castle[i] <- list(rownames(targetClassification)[apply(targetClassification,2,which.max)])\r\n  }\r\n  True_Labels_Castle <- as.vector(unlist(True_Labels_Castle))\r\n  Pred_Labels_Castle <- as.vector(unlist(Pred_Labels_Castle))\r\n  Training_Time_Castle <- as.vector(unlist(Training_Time_Castle))\r\n  Testing_Time_Castle <- as.vector(unlist(Testing_Time_Castle))\r\n  \r\n  if(!is.null(GeneOrderPath) & !is.null (NumGenes)){\r\n    write.csv(True_Labels_Castle,paste('True_Labels_Castle_',NumGenes,'.csv', sep = ''),row.names = FALSE)\r\n    write.csv(Pred_Labels_Castle,paste('Pred_Labels_Castle_',NumGenes,'.csv', sep = ''),row.names = FALSE)\r\n    write.csv(Training_Time_Castle,paste('Training_Time_Castle_',NumGenes,'.csv', sep = ''),row.names = FALSE)\r\n    write.csv(Testing_Time_Castle,paste('Testing_Time_Castle_',NumGenes,'.csv', sep = ''),row.names = FALSE)\r\n  }\r\n  else{\r\n    write.csv(True_Labels_Castle,'True_Labels_CaSTLe.csv',row.names = FALSE)\r\n    write.csv(Pred_Labels_Castle,'Pred_Labels_CaSTLe.csv',row.names = FALSE)\r\n    write.csv(Training_Time_Castle,'Training_Time_CaSTLe.csv',row.names = FALSE)\r\n    write.csv(Testing_Time_Castle,'Testing_Time_CaSTLe.csv',row.names = FALSE)\r\n  }\r\n  \r\n}"
  },
  {
    "path": "Scripts/run_Cell_BLAST.py",
    "content": "import os\r\nimport time as tm\r\nimport pandas as pd\r\nimport warnings\r\nwarnings.filterwarnings(\"ignore\")\r\n\r\nimport tensorflow as tf\r\ntf.logging.set_verbosity(0)\r\n\r\nimport Cell_BLAST as cb\r\nimport numpy as np\r\nfrom numpy import genfromtxt as gft\r\nimport rpy2.robjects as robjects\r\n\r\n\r\ndef run_Cell_BLAST(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = \"\", NumGenes = 0):\r\n    '''\r\n    run Cell_BLAST\r\n    Wrapper script to run Cell_BLAST on a benchmark dataset with 5-fold cross validation,\r\n    outputs lists of true and predicted cell labels as csv files, as well as computation time.\r\n  \r\n    Parameters\r\n    ----------\r\n    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes \r\n    as row names and gene names as column names.\r\n    LabelsPath : Cell population annotations file path (.csv).\r\n    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.\r\n    OutputDir : Output directory defining the path of the exported file.\r\n    GeneOrderPath : Gene order file path (.csv) obtained from feature selection, \r\n    defining the genes order for each cross validation fold, default is NULL.\r\n    NumGenes : Number of genes used in case of feature selection (integer), default is 0.\r\n    '''\r\n        \r\n    # read the Rdata file\r\n    robjects.r['load'](CV_RDataPath)\r\n\r\n    nfolds = np.array(robjects.r['n_folds'], dtype = 'int')\r\n    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')\r\n    col = np.array(robjects.r['col_Index'], dtype = 'int')\r\n    col = col - 1 \r\n    test_ind = np.array(robjects.r['Test_Idx'])\r\n    train_ind = np.array(robjects.r['Train_Idx'])\r\n    \r\n    # read the feature file\r\n    if (NumGenes > 0):\r\n        features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')\r\n\r\n    # read the data and labels\r\n    data_old = 
cb.data.ExprDataSet.read_table(DataPath,orientation=\"cg\", sep=\",\", index_col = 0, header = 0, sparsify = True).normalize()\r\n    labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col)\r\n    \r\n    data = cb.data.ExprDataSet(data_old.exprs[tokeep],data_old.obs.iloc[tokeep],data_old.var,data_old.uns)\r\n\r\n    labels = gft(LabelsPath, dtype = \"str\", skip_header = 1, delimiter = \",\", usecols = col)      \r\n    labels = labels[tokeep]\r\n\r\n    os.chdir(OutputDir)\r\n    \r\n    truelab = []\r\n    pred = []\r\n    tr_time = []\r\n    ts_time = []\r\n    \r\n    for i in range(np.squeeze(nfolds)):\r\n        test_ind_i = np.array(test_ind[i], dtype = 'int') - 1\r\n        train_ind_i = np.array(train_ind[i], dtype = 'int') - 1\r\n\r\n        train=data[train_ind_i,:]\r\n        test=data[test_ind_i,:]\r\n        y_train = labels[train_ind_i]\r\n        y_test = labels[test_ind_i]\r\n        \r\n        if (NumGenes > 0):\r\n            feat_to_use = features.iloc[0:NumGenes,i]\r\n            train = train[:,feat_to_use]\r\n            test = test[:,feat_to_use]\r\n\r\n        \r\n        train.obs['cell_type'] = y_train\r\n                \r\n        start = tm.time()\r\n                \r\n        # reduce dimensions\r\n        num_epoch = 50\r\n        models = []\r\n    \r\n        for j in range(4):\r\n            models.append(cb.directi.fit_DIRECTi(train, epoch=num_epoch, patience=10, random_seed = j, path=\"%d\" % j))\r\n    \r\n        # train model\r\n        blast = cb.blast.BLAST(models, train).build_empirical()\r\n        tr_time.append(tm.time()-start)\r\n        \r\n        # predict labels\r\n        start = tm.time()\r\n        test_pred = blast.query(test).annotate('cell_type')\r\n        ts_time.append(tm.time()-start)\r\n\r\n        truelab.extend(y_test)\r\n        pred.extend(test_pred.values)\r\n    \r\n    #write results    \r\n    truelab = pd.DataFrame(truelab)\r\n    pred = pd.DataFrame(pred)\r\n        
    \r\n    tr_time = pd.DataFrame(tr_time)\r\n    ts_time = pd.DataFrame(ts_time)\r\n    \r\n    if (NumGenes == 0):  \r\n        truelab.to_csv(\"Cell_BLAST_True_Labels.csv\", index = False)\r\n        pred.to_csv(\"Cell_BLAST_Pred_Labels.csv\", index = False)\r\n        tr_time.to_csv(\"Cell_BLAST_Training_Time.csv\", index = False)\r\n        ts_time.to_csv(\"Cell_BLAST_Testing_Time.csv\", index = False)\r\n    else:\r\n        truelab.to_csv(\"Cell_BLAST_\" + str(NumGenes) + \"_True_Labels.csv\", index = False)\r\n        pred.to_csv(\"Cell_BLAST_\" + str(NumGenes) + \"_Pred_Labels.csv\", index = False)\r\n        tr_time.to_csv(\"Cell_BLAST_\" + str(NumGenes) + \"_Training_Time.csv\", index = False)\r\n        ts_time.to_csv(\"Cell_BLAST_\" + str(NumGenes) + \"_Testing_Time.csv\", index = False)\r\n        \r\n"
  },
  {
    "path": "Scripts/run_DigitalCellSorter.py",
    "content": "import numpy as np\r\nimport pandas as pd\r\nimport scripts.DigitalCellSorter as DigitalCellSorter\r\nimport os\r\nimport time as tm\r\nimport rpy2.robjects as robjects\r\n\r\ndef run_DigitalCellSorter(DataPath, LabelsPath, CV_RDataPath, GeneListPath, OutputDir, GeneOrderPath = \"\", NumGenes = 0):\r\n    '''\r\n    run DigitalCellSorter\r\n    Wrapper script to run DigitalCellSorter on a benchmark dataset using a predefined genelist,\r\n    outputs lists of true and predicted cell labels as csv files, as well as computation time.  \r\n  \r\n    Parameters\r\n    ----------\r\n    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes \r\n    as row names and gene names as column names.\r\n    LabelsPath : Cell population annotations file path (.csv).\r\n    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.\r\n    GeneListPath : Data file path to the geneset.\r\n    OutputDir : Output directory defining the path of the exported file.\r\n    GeneOrderPath : Gene order file path (.csv) obtained from feature selection, \r\n    defining the genes order for each cross validation fold, default is NULL.\r\n    NumGenes : Number of genes used in case of feature selection (integer), default is 0.\r\n    '''\r\n        \r\n    # read the Rdata file\r\n    robjects.r['load'](CV_RDataPath)\r\n\r\n    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')\r\n    col = np.array(robjects.r['col_Index'], dtype = 'int')\r\n    col = col - 1\r\n    \r\n    # read the data\r\n    data = pd.read_csv(DataPath,index_col=0,sep=',')\r\n    data = data.iloc[tokeep]\r\n    \r\n    truelab = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col)\r\n    truelab = truelab.iloc[tokeep]\r\n\r\n\r\n    # read the feature file\r\n    if (NumGenes > 0):\r\n        features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')\r\n        feat_to_use = 
features.iloc[0:NumGenes,0]\r\n        data = data.iloc[:,feat_to_use]\r\n        \r\n    data = data.transpose()\r\n    \r\n    # number of different cell types in the data?\r\n    n_clusters = 8\r\n    AvailableCPUsCount = 1\r\n    N_samples_for_distribution = 10000\r\n        \r\n    start = tm.time()\r\n    pred = DigitalCellSorter.DigitalCellSorter().Process(data, 'DigitalCellSorter_Zhang', \r\n                                                saveDir = OutputDir, \r\n                                                geneListFileName = GeneListPath,\r\n                                                N_samples_for_distribution = N_samples_for_distribution,\r\n                                                AvailableCPUsCount = AvailableCPUsCount,\r\n                                                clusterIndex=None,\r\n                                                clusterName=None,\r\n                                                n_clusters=n_clusters)\t\r\n    runtime = tm.time() - start \r\n    \r\n    os.chdir(OutputDir)\r\n    \r\n    results = pd.read_excel('DigitalCellSorter_Zhang_voting.xlsx',header=0,index_col=None, usecols=[11])\r\n\r\n    prediction = np.zeros(np.shape(pred), dtype='>U10')\r\n    \r\n    for i in range(len(results)):\r\n    \tprediction[np.where(pred == i)] = results.values[i]\r\n    \r\n    prediction = pd.DataFrame(prediction)\r\n        \r\n    if (NumGenes == 0):  \r\n        truelab.to_csv(\"DigitalCellSorter_True_Labels.csv\", index = False)\r\n        prediction.to_csv(\"DigitalCellSorter_Pred_Labels.csv\", index = False)\r\n        with open(\"DigitalCellSorter_Total_Time.csv\", 'w') as f:\r\n            f.write(\"%f\\n\" % runtime)\r\n    else:\r\n        truelab.to_csv(\"DigitalCellSorter_\" + str(NumGenes) + \"_True_Labels.csv\", index = False)\r\n        prediction.to_csv(\"DigitalCellSorter_\" + str(NumGenes) + \"_Pred_Labels.csv\", index = False)\r\n        with open(\"DigitalCellSorter_\" + str(NumGenes) + 
\"_Total_Time.csv\", 'w') as f:\r\n            f.write(\"%f\\n\" % runtime)\r\n\r\n            \r\n\r\n        "
  },
  {
    "path": "Scripts/run_Garnett_CV.R",
    "content": "run_Garnett_CV <- function(DataPath, LabelsPath, CV_RDataPath, GenesPath, MarkerPath, OutputDir, Human){\r\n  \"\r\n  run Garnett\r\n  Wrapper script to run Garnett on a benchmark dataset with 5-fold cross validation,\r\n  outputs lists of true and predicted cell labels as csv files, as well as computation time.\r\n  \r\n  Parameters\r\n  ----------\r\n  DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes \r\n  as row names and gene names as column names.\r\n  LabelsPath : Cell population annotations file path (.csv).\r\n  CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.\r\n  GenesPath : Path to the file with the genenames\r\n  MarkerPath : Path to the file with marker genes\r\n  OutputDir : Output directory defining the path of the exported file.\r\n  Human : boolean indicating whether the dataset is human (TRUE) or mouse (FALSE)\r\n  \"\r\n\r\n  # load needed libraries\r\n  library(garnett)\r\n  if (Human) {\r\n    library(org.Hs.eg.db)\r\n  } else {\r\n    library(org.Mm.eg.db)\r\n  }\r\n  \r\n  # load the CVFile\r\n  load(CV_RDataPath)\r\n  \r\n  # read the labels\r\n  labels <- as.matrix(read.csv(LabelsPath))\r\n  labels <- as.vector(labels[,col_Index])\r\n  labels <- labels[Cells_to_Keep]\r\n  \r\n  # read the data\r\n  mat <- read.table(DataPath, sep = \",\")\r\n  data <- mat[-1,-1]\r\n  data <- data[Cells_to_Keep,]\r\n  data <- t(data) #ensure that the genes are rows, and the cells are columns\r\n  \r\n  cells <- mat[-1,1]\r\n  cells <- cells[Cells_to_Keep]\r\n  \r\n  # read the genefile \r\n  fdata <- read.table(GenesPath)\r\n  names(fdata) <- 'gene_short_name'\r\n  row.names(fdata) <- fdata$gene_short_name\r\n  fd <- new(\"AnnotatedDataFrame\", data = fdata)\r\n  \r\n  true_labels <- list()\r\n  pred_labels <- list()\r\n  train_time <- list()\r\n  test_time <- list()\r\n  \r\n  for (i in c(1:n_folds)){\r\n    lab_train = labels[Train_Idx[[i]]]\r\n    lab_test = 
labels[Test_Idx[[i]]]\r\n    \r\n    train = data[,Train_Idx[[i]]]\r\n    test = data[,Test_Idx[[i]]]\r\n    \r\n    cells_train = cells[Train_Idx[[i]]]\r\n    cells_test = cells[Test_Idx[[i]]]\r\n    \r\n    pdata_train = data.frame(cells_train)\r\n    pdata_test = data.frame(cells_test)\r\n    \r\n    row.names(train) <- row.names(fdata)\r\n    row.names(test) <- row.names(fdata)\r\n    colnames(train) <- row.names(pdata_train)\r\n    colnames(test) <- row.names(pdata_test)\r\n    \r\n    pd_train <- new(\"AnnotatedDataFrame\", data = pdata_train)\r\n    pd_test <- new(\"AnnotatedDataFrame\", data = pdata_test)\r\n    \r\n    pbmc_cds_train <- newCellDataSet(as(train, \"dgCMatrix\"), phenoData = pd_train, featureData = fd)\r\n    pbmc_cds_test <- newCellDataSet(as(test, \"dgCMatrix\"), phenoData = pd_test, featureData = fd)\r\n    \r\n    pbmc_cds_train <- estimateSizeFactors(pbmc_cds_train)\r\n    pbmc_cds_test <- estimateSizeFactors(pbmc_cds_test)\r\n    \r\n    # training\r\n    start_train <- Sys.time()\r\n    \r\n    if (Human){\r\n      pbmc_classifier <- train_cell_classifier(cds = pbmc_cds_train, \r\n                                               marker_file = MarkerPath,\r\n                                               db=org.Hs.eg.db,\r\n                                               cds_gene_id_type = \"SYMBOL\",\r\n                                               num_unknown = 50,\r\n                                               marker_file_gene_id_type = \"SYMBOL\")\r\n    } else {\r\n      pbmc_classifier <- train_cell_classifier(cds = pbmc_cds_train, \r\n                                               marker_file = MarkerPath,\r\n                                               db=org.Mm.eg.db,\r\n                                               cds_gene_id_type = \"SYMBOL\",\r\n                                               num_unknown = 50,\r\n                                               marker_file_gene_id_type = \"SYMBOL\")\r\n      \r\n    }\r\n  
  end_train <- Sys.time()\r\n    train_time[i] <- as.numeric(end_train - start_train)\r\n    \r\n    # testing\r\n    start_test <- Sys.time()\r\n    \r\n    if (Human) {\r\n      pbmc_cds_test <- classify_cells(pbmc_cds_test, \r\n                                      pbmc_classifier, \r\n                                      db = org.Hs.eg.db, \r\n                                      cluster_extend = TRUE,\r\n                                      cds_gene_id_type = \"SYMBOL\")\r\n    } else {\r\n      pbmc_cds_test <- classify_cells(pbmc_cds_test, \r\n                                      pbmc_classifier, \r\n                                      db = org.Mm.eg.db, \r\n                                      cluster_extend = TRUE,\r\n                                      cds_gene_id_type = \"SYMBOL\")\r\n    }\r\n    end_test <- Sys.time()\r\n    test_time[i] <- as.numeric(end_test - start_test)\r\n    \r\n    true_labels[i] <- list(lab_test)\r\n    pred_labels[i] <- list(pData(pbmc_cds_test)$cluster_ext_type)\r\n    \r\n    \r\n  }\r\n  \r\n  true_labels <- as.vector(unlist(true_labels))\r\n  pred_labels <- as.vector(unlist(pred_labels))\r\n  train_time <- as.vector(unlist(train_time))\r\n  test_time <- as.vector(unlist(test_time))\r\n  \r\n  setwd(OutputDir)\r\n  \r\n  write.csv(train_time,'Garnett_CV_Training_Time.csv',row.names = FALSE)\r\n  write.csv(test_time,'Garnett_CV_Testing_Time.csv',row.names = FALSE)\r\n  write.csv(true_labels, 'Garnett_CV_True_Labels.csv', row.names = FALSE)\r\n  write.csv(pred_labels, 'Garnett_CV_Pred_Labels.csv', row.names = FALSE)\r\n  \r\n  \r\n}"
  },
  {
    "path": "Scripts/run_Garnett_Pretrained.R",
    "content": "run_Garnett_Pretrained <- function(DataPath, LabelsPath, GenesPath, CV_RDataPath, ClassifierPath, OutputDir, Human){\r\n  \"\r\n  run Garnett\r\n  Wrapper script to run Garnett on a benchmark dataset with a pretrained classifier,\r\n  outputs lists of true and predicted cell labels as csv files, as well as computation time.\r\n  \r\n  Parameters\r\n  ----------\r\n  DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes \r\n  as row names and gene names as column names.\r\n  LabelsPath : Cell population annotations file path (.csv).\r\n  CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.\r\n  GenesPath : Path to the file with the genenames\r\n  ClassifierPath : Path to the pretrained classifier\r\n  OutputDir : Output directory defining the path of the exported file.\r\n  Human : boolean indicating whether the dataset is human (TRUE) or mouse (FALSE)\r\n  \"\r\n  # load needed libraries\r\n  library(garnett)\r\n  \r\n  if (Human) {\r\n    library(org.Hs.eg.db)\r\n  } else {\r\n    library(org.Mm.eg.db)\r\n  }\r\n  \r\n  # load data, genes, and marker file\r\n  load(CV_RDataPath)\r\n  \r\n  load(ClassifierPath)\r\n  \r\n  labels <- as.matrix(read.csv(LabelsPath))\r\n  labels <- labels[Cells_to_Keep]\r\n  \r\n  mat <- read.table(DataPath, sep = \",\")\r\n  data <- mat[-1,-1]\r\n  data <- data[Cells_to_Keep,]\r\n  data <- t(data) #ensure that the genes are rows, and the cells are columns\r\n  \r\n  barcodes <- mat[-1,1]\r\n  \r\n  pdata = data.frame(barcodes)\r\n  fdata <- read.table(GenesPath)\r\n  names(fdata) <- 'gene_short_name'\r\n  row.names(fdata) <- fdata$gene_short_name\r\n  \r\n  row.names(data) <- row.names(fdata)\r\n  colnames(data) <- row.names(pdata)\r\n  \r\n  pd <- new(\"AnnotatedDataFrame\", data = pdata)\r\n  fd <- new(\"AnnotatedDataFrame\", data = fdata)\r\n  pbmc_cds <- newCellDataSet(as(data, \"dgCMatrix\"),\r\n                             phenoData = 
pd,\r\n                             featureData = fd)\r\n  \r\n  start_time <- Sys.time()\r\n  \r\n  pbmc_cds <- estimateSizeFactors(pbmc_cds)\r\n  \r\n  if (Human){\r\n    pbmc_cds <- classify_cells(pbmc_cds, hsPBMC, db = org.Hs.eg.db, cluster_extend = TRUE, cds_gene_id_type = \"SYMBOL\")\r\n  } else {\r\n    pbmc_cds <- classify_cells(pbmc_cds, mmLung, db = org.Mm.eg.db, cluster_extend = TRUE, cds_gene_id_type = \"SYMBOL\")\r\n  }\r\n  \r\n  end_time <- Sys.time()\r\n  \r\n  test_time <- as.numeric(end_time - start_time)\r\n  \r\n  setwd(OutputDir)\r\n  \r\n  write.table(pData(pbmc_cds)$cluster_ext_type, file = \"Garnett_Pred_Labels.csv\", append = FALSE, quote = TRUE, sep = \"\\t\",\r\n              eol = \"\\n\", na = \"NA\", dec = \".\", row.names = FALSE,\r\n              qmethod = c(\"escape\", \"double\"),\r\n              fileEncoding = \"\")\r\n  \r\n  write.csv(labels,\"Garnett_Pretrained_True_Labels.csv\", row.names = FALSE)\r\n  \r\n  write.csv(test_time,'Garnett_Pretrained_Testing_Time.csv',row.names = FALSE)\r\n  \r\n  \r\n  \r\n}"
  },
  {
    "path": "Scripts/run_LAmbDA.py",
    "content": "# -*- coding: utf-8 -*-\r\n\"\"\"\r\nCreated on Thu May 23 13:51:15 2019\r\n\r\n@author: Lieke\r\n\"\"\"\r\n\r\nimport os \r\nimport numpy as np\r\nimport pandas as pd\r\nimport time as tm\r\nimport rpy2.robjects as robjects\r\nimport tensorflow as tf\r\nimport math\r\nimport scipy.io as sio\r\nimport optunity as opt\r\nfrom tensorflow.contrib.tensor_forest.python import tensor_forest\r\nfrom tensorflow.python.ops import resources\r\n\r\n\r\ndef run_LAmbDA(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = \"\", NumGenes = 0):\r\n    '''\r\n    run LAmbDA classifier\r\n    Wrapper script to run LAmbDA on a benchmark dataset with 5-fold cross validation,\r\n    outputs lists of true and predicted cell labels as csv files, as well as computation time.\r\n  \r\n    Parameters\r\n    ----------\r\n    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes \r\n    as row names and gene names as column names.\r\n    LabelsPath : Cell population annotations file path (.csv).\r\n    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.\r\n    OutputDir : Output directory defining the path of the exported file.\r\n    GeneOrderPath : Gene order file path (.csv) obtained from feature selection, \r\n    defining the genes order for each cross validation fold, default is NULL.\r\n    NumGenes : Number of genes used in case of feature selection (integer), default is 0.\r\n    '''\r\n        \r\n    # read the Rdata file\r\n    robjects.r['load'](CV_RDataPath)\r\n\r\n    nfolds = np.array(robjects.r['n_folds'], dtype = 'int')\r\n    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')\r\n    col = np.array(robjects.r['col_Index'], dtype = 'int')\r\n    col = col - 1 \r\n    test_ind = np.array(robjects.r['Test_Idx'])\r\n    train_ind = np.array(robjects.r['Train_Idx'])\r\n\r\n    # read the data\r\n    data = pd.read_csv(DataPath,index_col=0,sep=',')\r\n    labels = 
pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col)\r\n    \r\n    labels = labels.iloc[tokeep]\r\n    data = data.iloc[tokeep]\r\n    \r\n    # read the feature file\r\n    if (NumGenes > 0):\r\n        features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')\r\n    \r\n    # folder with results\r\n    os.chdir(OutputDir)\r\n                \r\n    tr_time=[]\r\n    ts_time=[]\r\n    truelab = np.zeros([len(labels),1],dtype = int)\r\n    predlab = np.zeros([len(labels),1],dtype = int)\r\n        \r\n    for i in range(np.squeeze(nfolds)):\r\n        global X, Y, Gnp, Dnp, train, test, prt, cv\r\n        test_ind_i = np.array(test_ind[i], dtype = 'int') - 1\r\n        train_ind_i = np.array(train_ind[i], dtype = 'int') - 1\r\n                \r\n        X = np.array(data) \r\n        if (NumGenes > 0):\r\n            X = np.log2(X/10+1)\r\n            feat_to_use = features.iloc[0:NumGenes,i]\r\n            X = X[:,feat_to_use]\r\n        else:\r\n            X = np.log2(np.transpose(select_feats(np.transpose(X),0.5,80))/10+1)\r\n    \r\n        uniq = np.unique(labels)\r\n        Y = np.zeros([len(labels),len(uniq)],int)\r\n        \r\n        for j in range(len(uniq)):\r\n            Y[np.where(labels == uniq[j])[0],j] = 1\r\n    \r\n        Y = np.array(Y)\r\n        \r\n        Gnp = np.zeros([len(uniq),len(uniq)],int)\r\n        np.fill_diagonal(Gnp,1)\r\n        Gnp = np.array(Gnp)\r\n        \r\n        Dnp = np.ones([len(uniq),1],int)\r\n        Dnp = np.array(Dnp)\r\n        \r\n        train_samp = int(np.floor(0.75*len(train_ind_i)))\r\n        test_samp = len(train_ind_i) - train_samp\r\n        perm = np.random.permutation(len(train_ind_i))\r\n        train = perm[0:train_samp]\r\n        test = perm[train_samp:test_samp+1]\r\n        \r\n        while(np.sum(np.sum(Y[train,:],0)<5)>0):\r\n            perm = np.random.permutation(X.shape[0])\r\n            train = perm[0:train_samp+1]\r\n            test = 
perm[train_samp+1:train_samp+test_samp+1]\r\n        \r\n        cv = i\r\n        optunity_it = 0\r\n        prt = False\r\n        opt_params = None\r\n                    \r\n        start=tm.time()\r\n        opt_params, _, _ = opt.minimize(run_LAmbDA2,solver_name='sobol', gamma=[0.8,1.2], delta=[0.05,0.95], tau=[10.0,11.0], prc_cut=[20,50], bs_prc=[0.2,0.6], num_trees=[10,200], max_nodes=[100,1000], num_evals=50)\r\n        tr_time.append(tm.time()-start)\r\n        \r\n        print(\"Finished training!\")\r\n        \r\n        prt = True\r\n        train = train_ind_i\r\n        test = test_ind_i\r\n        \r\n        start=tm.time()\r\n        err = run_LAmbDA2(opt_params['gamma'], opt_params['delta'], opt_params['tau'], opt_params['prc_cut'], opt_params['bs_prc'], opt_params['num_trees'], opt_params['max_nodes'])\r\n        ts_time.append(tm.time()-start)\r\n        \r\n        tf.reset_default_graph();\r\n        \r\n        predfile = 'preds_cv' + str(cv) + '.mat'\r\n        truefile = 'truth_cv' + str(cv) + '.mat'\r\n        pred = sio.loadmat(predfile)\r\n        truth = sio.loadmat(truefile)\r\n        \r\n        pred = pred['preds']\r\n        truth = truth['labels']\r\n        \r\n        pred_ind = np.argmax(pred,axis=1)\r\n        truth_ind = np.argmax(truth,axis=1)\r\n        \r\n        predlab[test_ind_i,0] = pred_ind\r\n        truelab[test_ind_i,0] = truth_ind\r\n            \r\n                \r\n    truelab = pd.DataFrame(truelab)\r\n    predlab = pd.DataFrame(predlab)\r\n        \r\n    tr_time = pd.DataFrame(tr_time)\r\n    ts_time = pd.DataFrame(ts_time)\r\n        \r\n    if (NumGenes == 0):  \r\n        truelab.to_csv(\"LAmbDA_True_Labels.csv\", index = False)\r\n        predlab.to_csv(\"LAmbDA_Pred_Labels.csv\", index = False)\r\n        tr_time.to_csv(\"LAmbDA_Training_Time.csv\", index = False)\r\n        ts_time.to_csv(\"LAmbDA_Testing_Time.csv\", index = False)\r\n    else:\r\n        truelab.to_csv(\"LAmbDA_\" + str(NumGenes) 
+ \"_True_Labels.csv\", index = False)\r\n        predlab.to_csv(\"LAmbDA_\" + str(NumGenes) + \"_Pred_Labels.csv\", index = False)\r\n        tr_time.to_csv(\"LAmbDA_\" + str(NumGenes) + \"_Training_Time.csv\", index = False)\r\n        ts_time.to_csv(\"LAmbDA_\" + str(NumGenes) + \"_Testing_Time.csv\", index = False)\r\n\r\n\r\n##### Functions copied from LAmbDA's Github\r\ndef wt_cutoff(colnum,cutoff,Gtmp,gamma):\r\n\trowsums = np.sum(Gtmp,axis=1);\r\n\treturn(math.ceil(cutoff*(math.log((max(rowsums)/rowsums[colnum])+1,2)**gamma)))\r\n\r\ndef resample(prc_cut,Y,Gtmp,train,gamma):\r\n\tadd = list()\r\n\trem = list()\r\n\tcolsums = np.sum(Y[train,:],axis=0);\r\n\tcutoff = math.ceil(np.percentile(colsums,prc_cut));\r\n\tfor i in range(len(colsums)):\r\n\t\tif colsums[i] == 0:\r\n\t\t\tpass\r\n\t\telif colsums[i] < wt_cutoff(i,cutoff,Gtmp,gamma):\r\n\t\t\tidx = np.squeeze(np.array(np.where(Y[train,i]>=1)));\r\n\t\t\tchoice = np.random.choice(train[idx],int(wt_cutoff(i,cutoff,Gtmp,gamma)-colsums[i]))\r\n\t\t\tadd = add + choice.tolist();\r\n\t\telif colsums[i] == wt_cutoff(i,cutoff,Gtmp,gamma):\r\n\t\t\tpass\r\n\t\telse:\r\n\t\t\tidx = np.squeeze(np.array(np.where(Y[train,i]>=1)));\r\n\t\t\tchoice = np.random.choice(train[idx],int(colsums[i]-wt_cutoff(i,cutoff,Gtmp,gamma)),replace=False)\r\n\t\t\trem = rem + choice.tolist()\r\n\treturn np.concatenate((list([val for val in train if val not in rem]),add));\r\n\r\ndef select_feats(Xtmp,num_zero_prc_cut,var_prc_cut):\r\n\t#*********************************************************************\r\n\t# remove features with many zeros\r\n\tnum_feat_zeros = np.sum(Xtmp==0,axis=1);\r\n\tXtmp = Xtmp[num_feat_zeros<num_zero_prc_cut*Xtmp.shape[1],:]\r\n\t#*********************************************************************\r\n\t# remove features with low variance\r\n\tfeat_vars = np.var(Xtmp,axis=1)\r\n\tXtmp = Xtmp[feat_vars>np.percentile(feat_vars,var_prc_cut),:]\r\n\treturn(Xtmp)\r\n\r\ndef 
get_yn(predict,ys,delta,tau,output_feats):\r\n\tD = tf.cast(Dnp, tf.float32);\r\n\tG = tf.cast(Gnp, tf.float32);\r\n\tys = tf.cast(ys, tf.float32);\r\n\t#print(\"start\")\r\n\tCm = tf.matmul(tf.transpose(tf.matmul(ys,D)),predict+0.1)/tf.reshape(tf.reduce_sum(tf.transpose(tf.matmul(ys,D)),1),(-1,1));\r\n\t#print(\"1\")\r\n\tmCm = tf.reshape(tf.reduce_mean(tf.cast(tf.matmul(tf.transpose(D),G)>0,tf.float32)*Cm,1),(-1,1));\r\n\t#print(\"2\")\r\n\tyw = tf.multiply(predict+0.1,tf.matmul(tf.matmul(ys,D),tf.pow(mCm/Cm,tau)));\r\n\t#print(\"3\")\r\n\tye = tf.multiply(tf.matmul(ys,G),yw);\r\n\t#print(\"4\")\r\n\tyt = tf.matmul(ys,tf.matmul(tf.transpose(ys),ye));\r\n\t#print(\"5\")\r\n\tya = (delta*yt)+((1-delta)*ye)\r\n\t#print(\"6\")\r\n\tyn = tf.cast(tf.one_hot(tf.argmax(ya,axis=1),output_feats), dtype=tf.float32)\r\n\t#print(\"7\")\r\n\treturn(yn)\r\n\r\ndef get_yi(rowsums,G2,ys):\r\n\tG2 = tf.cast(G2, tf.float32);\r\n\tys = tf.cast(ys, tf.float32);\r\n\tyi = tf.cast(tf.matmul(ys,G2), dtype=tf.float32);\r\n\treturn(yi)\r\n\r\ndef run_LAmbDA2(gamma, delta, tau, prc_cut, bs_prc, num_trees, max_nodes):\r\n\tglobal X, Y, Gnp, Dnp, train, test, prt, cv\r\n\tD = tf.cast(Dnp, tf.float32);\r\n\tG = tf.cast(Gnp, tf.float32);\r\n\t#optunity_it = optunity_it+1;\r\n\tnum_trees = int(num_trees);\r\n\tmax_nodes = int(max_nodes);\r\n\tprc_cut = int(np.ceil(prc_cut));\r\n\tprint(\"gamma=%.4f, delta=%.4f, tau=%.4f, prc_cut=%i, bs_prc=%.4f, num_trees=%i, max_nodes=%i\" % (gamma, delta, tau, prc_cut, bs_prc, num_trees, max_nodes))\r\n\tinput_feats = X.shape[1];\r\n\tnum_labls = G.shape.as_list();\r\n\toutput_feats = num_labls[1];\r\n\t#print(output_feats)\r\n\tnum_labls = num_labls[0];\r\n\trowsums = np.sum(Gnp,axis=1);\r\n\ttrain2 = resample(prc_cut, Y, Gnp, train, gamma);\t\t\t\t# Bug??\r\n\tbs = int(np.ceil(bs_prc*train2.size))\r\n\txs = tf.placeholder(tf.float32, [None,input_feats])\r\n\t#ys = tf.placeholder(tf.float32, [None,num_labls])\r\n\tyin = tf.placeholder(tf.int32, 
[None])\r\n\tprint(\"Vars loaded xs and ys created\")\r\n\thparams = tensor_forest.ForestHParams(num_classes=output_feats,\r\n\t\t\t\t\t\t\t\t\tnum_features=input_feats,\r\n\t\t\t\t\t\t\t\t\tnum_trees=num_trees,\r\n\t\t\t\t\t\t\t\t\tmax_nodes=max_nodes).fill()\r\n\tprint(\"Tensor forest hparams created\")\t\t\t\t\t\t\t\t\r\n\tforest_graph = tensor_forest.RandomForestGraphs(hparams)\r\n\tprint(\"Tensor forest graph created\")\r\n\ttrain_op = forest_graph.training_graph(xs, yin)\r\n\tloss_op = forest_graph.training_loss(xs, yin)\r\n\tprint(\"Loss and train ops created\")\r\n\tpredict, _, _ = forest_graph.inference_graph(xs)\r\n\tprint(\"Tensor forest variables created through predict\")\r\n\taccuracy_op = tf.reduce_mean(tf.reduce_sum(tf.square(tf.one_hot(yin,output_feats)-predict),reduction_indices=[1]))\r\n\tprint(tf.reduce_sum(tf.square(tf.one_hot(yin,output_feats)-predict),reduction_indices=[1]))\r\n\t#predict = tf.one_hot(pred);\r\n\tprint(\"Lambda specific variables created\")\r\n\t# Creating training and testing steps\r\n\tG2 = np.copy(Gnp);\r\n\tG2[rowsums>1,:] = 0;\r\n\tYI = np.matmul(Y,G2);\r\n\tYIrs = np.sum(YI,axis=1);\r\n\ttrainI = train2[np.in1d(train2,np.where(YIrs==1))];\r\n\tprint(\"data type trainI,\",trainI.dtype)\r\n\ttestI = test[np.in1d(test,np.where(YIrs==1))];\r\n\tprint(\"trainI testI created\")\r\n\t#init_vars=tf.global_variables_initializer()\r\n\tinit_vars = tf.group(tf.global_variables_initializer(),\r\n\tresources.initialize_resources(resources.shared_resources()))\r\n\tsess = tf.Session()\r\n\tsess.run(init_vars)\r\n\tprint(\"Session started\")\r\n\t#beep = sess.run(predict,feed_dict={xs:X[1:100,:]});\r\n\t#beep = sess.run(predict,feed_dict={xs:X[train2[0:bs],:]});\r\n\ttensor_trainI = {xs: X[trainI, :], yin: sess.run(tf.argmax(get_yi(rowsums,G2,Y[trainI, :]),axis=1))}\r\n\tprint(\"tensor_trainI made\")\r\n\ttensor_testI = {xs: X[testI, :], yin: sess.run(tf.argmax(get_yi(rowsums,G2,Y[testI, :]),axis=1))}\r\n\tprint(\"tensor_testI 
made\")\r\n\ttensor_train = {xs: X[train2[0:bs], :], yin: sess.run(tf.argmax(get_yn(sess.run(predict,feed_dict={xs:X[train2[0:bs],:]}),Y[train2[0:bs], :],delta,tau,output_feats),axis=1))}\r\n\tprint(\"tensor_train made\")\r\n\ttensor_test = {xs: X[test, :], yin: sess.run(tf.argmax(get_yn(sess.run(predict,feed_dict={xs:X[test,:]}),Y[test, :],delta,tau,output_feats),axis=1))}\r\n\tprint(\"tensor_test made\")\r\n\t#**********************************\r\n\t#print(\"Loss and training steps created with sample tensors\")\r\n\t# Setting params and initializing\r\n\tprint(\"Beginning iterations\")\r\n\t# Starting training iterations\r\n\tprint(X.shape)\r\n\tfor i in range(1,101):\r\n\t\tif i < 50:\r\n\t\t\tsess.run(train_op, feed_dict=tensor_trainI)\r\n\t\t\t#print(\"ran train op\")\r\n\t\t\tif i % 10 == 0:\r\n\t\t\t\tprint(str(sess.run(accuracy_op, feed_dict=tensor_trainI)) + ' ' + str(sess.run(accuracy_op, feed_dict=tensor_testI)))\r\n\t\telse:\r\n\t\t\tsess.run(train_op, feed_dict=tensor_train)\r\n\t\t\tif i % 10 == 0:\r\n\t\t\t\tprint(str(sess.run(accuracy_op, feed_dict=tensor_train)) + ' ' + str(sess.run(accuracy_op, feed_dict=tensor_test)))\r\n\t\t\telif i % 10 == 0:\r\n\t\t\t\tnp.random_shuffle(train2);\r\n\t\t\t\ttensor_train = {xs: X[train2[0:bs], :], yin: sess.run(get_yn(sess.run(predict,feed_dict={xs:X[train2[0:bs],:]}),Y[train2[0:bs], :],delta,tau,output_feats))}\r\n\tif prt:\r\n\t\tblah = sess.run(predict, feed_dict=tensor_test);\r\n\t\tsio.savemat('preds_cv' + str(cv) + '.mat', {'preds': blah});\r\n\t\tsio.savemat('truth_cv' + str(cv) + '.mat', {'labels': Y[test, :]});\r\n\tacc = sess.run(accuracy_op, feed_dict=tensor_test) \r\n\tprint(\"loss1=%.4f, gamma=%.4f, delta=%.4f, tau=%.4f, prc_cut=%i, bs_prc=%.4f, num_trees=%i, max_nodes=%i\" % (acc, gamma, delta, tau, prc_cut, bs_prc, num_trees, max_nodes))\r\n\ttf.reset_default_graph();\r\n\treturn(acc)\r\n"
  },
  {
    "path": "Scripts/run_LDA.py",
    "content": "import os \r\nimport numpy as np\r\nimport pandas as pd\r\nimport time as tm\r\nfrom sklearn.discriminant_analysis import LinearDiscriminantAnalysis\r\nimport rpy2.robjects as robjects\r\n\r\n\r\ndef run_LDA(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = \"\", NumGenes = 0):\r\n    '''\r\n    run baseline classifier: LDA\r\n    Wrapper script to run LDA classifier on a benchmark dataset with 5-fold cross validation,\r\n    outputs lists of true and predicted cell labels as csv files, as well as computation time.\r\n  \r\n    Parameters\r\n    ----------\r\n    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes \r\n    as row names and gene names as column names.\r\n    LabelsPath : Cell population annotations file path (.csv).\r\n    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.\r\n    OutputDir : Output directory defining the path of the exported file.\r\n    GeneOrderPath : Gene order file path (.csv) obtained from feature selection, \r\n    defining the genes order for each cross validation fold, default is NULL.\r\n    NumGenes : Number of genes used in case of feature selection (integer), default is 0.\r\n    '''\r\n        \r\n    # read the Rdata file\r\n    robjects.r['load'](CV_RDataPath)\r\n\r\n    nfolds = np.array(robjects.r['n_folds'], dtype = 'int')\r\n    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')\r\n    col = np.array(robjects.r['col_Index'], dtype = 'int')\r\n    col = col - 1 \r\n    test_ind = np.array(robjects.r['Test_Idx'])\r\n    train_ind = np.array(robjects.r['Train_Idx'])\r\n\r\n    # read the data\r\n    data = pd.read_csv(DataPath,index_col=0,sep=',')\r\n    labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col)\r\n    \r\n    labels = labels.iloc[tokeep]\r\n    data = data.iloc[tokeep]\r\n    \r\n    # read the feature file\r\n    if (NumGenes > 0):\r\n        features = 
pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')\r\n    \r\n    # folder with results\r\n    os.chdir(OutputDir)\r\n    \r\n    # normalize data\r\n    data = np.log1p(data)\r\n        \r\n    Classifier = LinearDiscriminantAnalysis()\r\n            \r\n    tr_time=[]\r\n    ts_time=[]\r\n    truelab = []\r\n    pred = []\r\n        \r\n    for i in range(np.squeeze(nfolds)):\r\n        test_ind_i = np.array(test_ind[i], dtype = 'int') - 1\r\n        train_ind_i = np.array(train_ind[i], dtype = 'int') - 1\r\n    \r\n        train=data.iloc[train_ind_i]\r\n        test=data.iloc[test_ind_i]\r\n        y_train=labels.iloc[train_ind_i]\r\n        y_test=labels.iloc[test_ind_i]\r\n            \r\n        if (NumGenes > 0):\r\n            feat_to_use = features.iloc[0:NumGenes,i]\r\n            train = train.iloc[:,feat_to_use]\r\n            test = test.iloc[:,feat_to_use]\r\n            \r\n        start=tm.time()\r\n        Classifier.fit(train, y_train)\r\n        tr_time.append(tm.time()-start)\r\n                    \r\n        start=tm.time()\r\n        predicted = Classifier.predict(test)\r\n        ts_time.append(tm.time()-start)\r\n            \r\n        truelab.extend(y_test.values)\r\n        pred.extend(predicted)\r\n                \r\n    truelab = pd.DataFrame(truelab)\r\n    pred = pd.DataFrame(pred)\r\n        \r\n    tr_time = pd.DataFrame(tr_time)\r\n    ts_time = pd.DataFrame(ts_time)\r\n        \r\n    if (NumGenes == 0):  \r\n        truelab.to_csv(\"LDA_True_Labels.csv\", index = False)\r\n        pred.to_csv(\"LDA_Pred_Labels.csv\", index = False)\r\n        tr_time.to_csv(\"LDA_Training_Time.csv\", index = False)\r\n        ts_time.to_csv(\"LDA_Testing_Time.csv\", index = False)\r\n    else:\r\n        truelab.to_csv(\"LDA_\" + str(NumGenes) + \"_True_Labels.csv\", index = False)\r\n        pred.to_csv(\"LDA_\" + str(NumGenes) + \"_Pred_Labels.csv\", index = False)\r\n        tr_time.to_csv(\"LDA_\" + str(NumGenes) + 
\"_Training_Time.csv\", index = False)\r\n        ts_time.to_csv(\"LDA_\" + str(NumGenes) + \"_Testing_Time.csv\", index = False)\r\n\r\n    \r\n\r\n\r\n\r\n"
  },
  {
    "path": "Scripts/run_LDA_rejection.py",
    "content": "import os \r\nimport numpy as np\r\nimport pandas as pd\r\nimport time as tm\r\nfrom sklearn.discriminant_analysis import LinearDiscriminantAnalysis\r\nimport rpy2.robjects as robjects\r\n\r\n\r\ndef run_LDA_rejection(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = \"\", NumGenes = 0, Threshold = 0.7):\r\n    '''\r\n    run baseline classifier: LDA\r\n    Wrapper script to run LDA classifier on a benchmark dataset with 5-fold cross validation,\r\n    outputs lists of true and predicted cell labels as csv files, as well as computation time.\r\n  \r\n    Parameters\r\n    ----------\r\n    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes \r\n    as row names and gene names as column names.\r\n    LabelsPath : Cell population annotations file path (.csv).\r\n    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.\r\n    OutputDir : Output directory defining the path of the exported file.\r\n    GeneOrderPath : Gene order file path (.csv) obtained from feature selection, \r\n    defining the genes order for each cross validation fold, default is NULL.\r\n    NumGenes : Number of genes used in case of feature selection (integer), default is 0.\r\n    Threshold : Threshold used when rejecting the genes, default is 0.7.\r\n    '''\r\n        \r\n    # read the Rdata file\r\n    robjects.r['load'](CV_RDataPath)\r\n\r\n    nfolds = np.array(robjects.r['n_folds'], dtype = 'int')\r\n    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')\r\n    col = np.array(robjects.r['col_Index'], dtype = 'int')\r\n    col = col - 1 \r\n    test_ind = np.array(robjects.r['Test_Idx'])\r\n    train_ind = np.array(robjects.r['Train_Idx'])\r\n\r\n    # read the data\r\n    data = pd.read_csv(DataPath,index_col=0,sep=',')\r\n    labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col)\r\n    \r\n    labels = labels.iloc[tokeep]\r\n    data = 
data.iloc[tokeep]\r\n    \r\n    # read the feature file\r\n    if (NumGenes > 0):\r\n        features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')\r\n    \r\n    # folder with results\r\n    os.chdir(OutputDir)\r\n    \r\n    # normalize data\r\n    data = np.log1p(data)\r\n        \r\n    Classifier = LinearDiscriminantAnalysis()\r\n            \r\n    tr_time=[]\r\n    ts_time=[]\r\n    truelab = []\r\n    pred = []\r\n        \r\n    for i in range(np.squeeze(nfolds)):\r\n        test_ind_i = np.array(test_ind[i], dtype = 'int') - 1\r\n        train_ind_i = np.array(train_ind[i], dtype = 'int') - 1\r\n    \r\n        train=data.iloc[train_ind_i]\r\n        test=data.iloc[test_ind_i]\r\n        y_train=labels.iloc[train_ind_i]\r\n        y_test=labels.iloc[test_ind_i]\r\n            \r\n        if (NumGenes > 0):\r\n            feat_to_use = features.iloc[0:NumGenes,i]\r\n            train = train.iloc[:,feat_to_use]\r\n            test = test.iloc[:,feat_to_use]\r\n            \r\n        start=tm.time()\r\n        Classifier.fit(train, y_train)\r\n        tr_time.append(tm.time()-start)\r\n                    \r\n        start=tm.time()\r\n        predicted = Classifier.predict(test)\r\n        prob = np.max(Classifier.predict_proba(test), axis = 1)\r\n        unlabeled = np.where(prob < Threshold)\r\n        predicted[unlabeled] = 'Unknown'\r\n        ts_time.append(tm.time()-start)\r\n            \r\n        truelab.extend(y_test.values)\r\n        pred.extend(predicted)\r\n                \r\n    truelab = pd.DataFrame(truelab)\r\n    pred = pd.DataFrame(pred)\r\n        \r\n    tr_time = pd.DataFrame(tr_time)\r\n    ts_time = pd.DataFrame(ts_time)\r\n        \r\n    if (NumGenes == 0):  \r\n        truelab.to_csv(\"LDA_True_Labels.csv\", index = False)\r\n        pred.to_csv(\"LDA_Pred_Labels.csv\", index = False)\r\n        tr_time.to_csv(\"LDA_Training_Time.csv\", index = False)\r\n        ts_time.to_csv(\"LDA_Testing_Time.csv\", index = 
False)\r\n    else:\r\n        truelab.to_csv(\"LDA_\" + str(NumGenes) + \"_True_Labels.csv\", index = False)\r\n        pred.to_csv(\"LDA_\" + str(NumGenes) + \"_Pred_Labels.csv\", index = False)\r\n        tr_time.to_csv(\"LDA_\" + str(NumGenes) + \"_Training_Time.csv\", index = False)\r\n        ts_time.to_csv(\"LDA_\" + str(NumGenes) + \"_Testing_Time.csv\", index = False)\r\n\r\n    \r\n\r\n\r\n\r\n"
  },
  {
    "path": "Scripts/run_NMC.py",
    "content": "import os \r\nimport numpy as np\r\nimport pandas as pd\r\nimport time as tm\r\nfrom sklearn.neighbors import NearestCentroid\r\nimport rpy2.robjects as robjects\r\n\r\n\r\ndef run_NMC(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = \"\", NumGenes = 0):\r\n    '''\r\n    run baseline classifier: NMC\r\n    Wrapper script to run a NMC classifier on a benchmark dataset with 5-fold cross validation,\r\n    outputs lists of true and predicted cell labels as csv files, as well as computation time.\r\n  \r\n    Parameters\r\n    ----------\r\n    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes \r\n    as row names and gene names as column names.\r\n    LabelsPath : Cell population annotations file path (.csv).\r\n    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.\r\n    OutputDir : Output directory defining the path of the exported file.\r\n    GeneOrderPath : Gene order file path (.csv) obtained from feature selection, \r\n    defining the genes order for each cross validation fold, default is NULL.\r\n    NumGenes : Number of genes used in case of feature selection (integer), default is 0.\r\n    '''\r\n        \r\n    # read the Rdata file\r\n    robjects.r['load'](CV_RDataPath)\r\n\r\n    nfolds = np.array(robjects.r['n_folds'], dtype = 'int')\r\n    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')\r\n    col = np.array(robjects.r['col_Index'], dtype = 'int')\r\n    col = col - 1 \r\n    test_ind = np.array(robjects.r['Test_Idx'])\r\n    train_ind = np.array(robjects.r['Train_Idx'])\r\n\r\n    # read the data\r\n    data = pd.read_csv(DataPath,index_col=0,sep=',')\r\n    labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col)\r\n    \r\n    labels = labels.iloc[tokeep]\r\n    data = data.iloc[tokeep]\r\n    \r\n    # read the feature file\r\n    if (NumGenes > 0):\r\n        features = 
pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')\r\n    \r\n    # folder with results\r\n    os.chdir(OutputDir)\r\n    \r\n    # normalize data\r\n    data = np.log1p(data)\r\n        \r\n    Classifier = NearestCentroid()\r\n            \r\n    tr_time=[]\r\n    ts_time=[]\r\n    truelab = []\r\n    pred = []\r\n        \r\n    for i in range(np.squeeze(nfolds)):\r\n        test_ind_i = np.array(test_ind[i], dtype = 'int') - 1\r\n        train_ind_i = np.array(train_ind[i], dtype = 'int') - 1\r\n    \r\n        train=data.iloc[train_ind_i]\r\n        test=data.iloc[test_ind_i]\r\n        y_train=labels.iloc[train_ind_i]\r\n        y_test=labels.iloc[test_ind_i]\r\n            \r\n        if (NumGenes > 0):\r\n            feat_to_use = features.iloc[0:NumGenes,i]\r\n            train = train.iloc[:,feat_to_use]\r\n            test = test.iloc[:,feat_to_use]\r\n            \r\n        start=tm.time()\r\n        Classifier.fit(train, y_train)\r\n        tr_time.append(tm.time()-start)\r\n                    \r\n        start=tm.time()\r\n        predicted = Classifier.predict(test)\r\n        ts_time.append(tm.time()-start)\r\n            \r\n        truelab.extend(y_test.values)\r\n        pred.extend(predicted)\r\n                \r\n    truelab = pd.DataFrame(truelab)\r\n    pred = pd.DataFrame(pred)\r\n        \r\n    tr_time = pd.DataFrame(tr_time)\r\n    ts_time = pd.DataFrame(ts_time)\r\n        \r\n    if (NumGenes == 0):  \r\n        truelab.to_csv(\"NMC_True_Labels.csv\", index = False)\r\n        pred.to_csv(\"NMC_Pred_Labels.csv\", index = False)\r\n        tr_time.to_csv(\"NMC_Training_Time.csv\", index = False)\r\n        ts_time.to_csv(\"NMC_Testing_Time.csv\", index = False)\r\n    else:\r\n        truelab.to_csv(\"NMC_\" + str(NumGenes) + \"_True_Labels.csv\", index = False)\r\n        pred.to_csv(\"NMC_\" + str(NumGenes) + \"_Pred_Labels.csv\", index = False)\r\n        tr_time.to_csv(\"NMC_\" + str(NumGenes) + \"_Training_Time.csv\", 
index = False)\r\n        ts_time.to_csv(\"NMC_\" + str(NumGenes) + \"_Testing_Time.csv\", index = False)\r\n\r\n    \r\n\r\n\r\n\r\n\r\n"
  },
  {
    "path": "Scripts/run_RF.py",
    "content": "import os \r\nimport numpy as np\r\nimport pandas as pd\r\nimport time as tm\r\nfrom sklearn.ensemble import RandomForestClassifier\r\nimport rpy2.robjects as robjects\r\n\r\n\r\ndef run_RF(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = \"\", NumGenes = 0):\r\n    '''\r\n    run baseline classifier: RF\r\n    Wrapper script to run a RF classifier with 50 trees on a benchmark dataset with 5-fold cross validation,\r\n    outputs lists of true and predicted cell labels as csv files, as well as computation time.\r\n  \r\n    Parameters\r\n    ----------\r\n    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes \r\n    as row names and gene names as column names.\r\n    LabelsPath : Cell population annotations file path (.csv).\r\n    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.\r\n    OutputDir : Output directory defining the path of the exported file.\r\n    GeneOrderPath : Gene order file path (.csv) obtained from feature selection, \r\n    defining the genes order for each cross validation fold, default is NULL.\r\n    NumGenes : Number of genes used in case of feature selection (integer), default is 0.\r\n    '''\r\n        \r\n    # read the Rdata file\r\n    robjects.r['load'](CV_RDataPath)\r\n\r\n    nfolds = np.array(robjects.r['n_folds'], dtype = 'int')\r\n    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')\r\n    col = np.array(robjects.r['col_Index'], dtype = 'int')\r\n    col = col - 1 \r\n    test_ind = np.array(robjects.r['Test_Idx'])\r\n    train_ind = np.array(robjects.r['Train_Idx'])\r\n\r\n    # read the data\r\n    data = pd.read_csv(DataPath,index_col=0,sep=',')\r\n    labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col)\r\n    \r\n    labels = labels.iloc[tokeep]\r\n    data = data.iloc[tokeep]\r\n    \r\n    # read the feature file\r\n    if (NumGenes > 0):\r\n        features = 
pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')\r\n    \r\n    # folder with results\r\n    os.chdir(OutputDir)\r\n    \r\n    # normalize data\r\n    data = np.log1p(data)\r\n        \r\n    Classifier = RandomForestClassifier(n_estimators = 50)\r\n            \r\n    tr_time=[]\r\n    ts_time=[]\r\n    truelab = []\r\n    pred = []\r\n        \r\n    for i in range(np.squeeze(nfolds)):\r\n        test_ind_i = np.array(test_ind[i], dtype = 'int') - 1\r\n        train_ind_i = np.array(train_ind[i], dtype = 'int') - 1\r\n    \r\n        train=data.iloc[train_ind_i]\r\n        test=data.iloc[test_ind_i]\r\n        y_train=labels.iloc[train_ind_i]\r\n        y_test=labels.iloc[test_ind_i]\r\n            \r\n        if (NumGenes > 0):\r\n            feat_to_use = features.iloc[0:NumGenes,i]\r\n            train = train.iloc[:,feat_to_use]\r\n            test = test.iloc[:,feat_to_use]\r\n            \r\n        start=tm.time()\r\n        Classifier.fit(train, y_train)\r\n        tr_time.append(tm.time()-start)\r\n                    \r\n        start=tm.time()\r\n        predicted = Classifier.predict(test)\r\n        ts_time.append(tm.time()-start)\r\n            \r\n        truelab.extend(y_test.values)\r\n        pred.extend(predicted)\r\n                \r\n    truelab = pd.DataFrame(truelab)\r\n    pred = pd.DataFrame(pred)\r\n        \r\n    tr_time = pd.DataFrame(tr_time)\r\n    ts_time = pd.DataFrame(ts_time)\r\n        \r\n    if (NumGenes == 0):  \r\n        truelab.to_csv(\"RF_True_Labels.csv\", index = False)\r\n        pred.to_csv(\"RF_Pred_Labels.csv\", index = False)\r\n        tr_time.to_csv(\"RF_Training_Time.csv\", index = False)\r\n        ts_time.to_csv(\"RF_Testing_Time.csv\", index = False)\r\n    else:\r\n        truelab.to_csv(\"RF_\" + str(NumGenes) + \"_True_Labels.csv\", index = False)\r\n        pred.to_csv(\"RF_\" + str(NumGenes) + \"_Pred_Labels.csv\", index = False)\r\n        tr_time.to_csv(\"RF_\" + str(NumGenes) + 
\"_Training_Time.csv\", index = False)\r\n        ts_time.to_csv(\"RF_\" + str(NumGenes) + \"_Testing_Time.csv\", index = False)\r\n\r\n    \r\n\r\n\r\n\r\n\r\n"
  },
  {
    "path": "Scripts/run_SCINA.R",
    "content": "run_SCINA<-function(DataPath,LabelsPath,GeneSigPath,OutputDir){\r\n  \"\r\n  run SCINA\r\n  Wrapper script to run SCINA on a benchmark dataset,\r\n  outputs lists of true and predicted cell labels as csv files, as well as computation time.\r\n  \r\n  Parameters\r\n  ----------\r\n  DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes \r\n  as row names and gene names as column names.\r\n  LabelsPath : Cell population annotations file path (.csv).\r\n  GeneSigPath : Cell type marker genes file path (.csv)\r\n  OutputDir : Output directory defining the path of the exported file.\r\n  \"\r\n  \r\n  Data <- read.csv(DataPath,row.names = 1)\r\n  Labels <- as.vector(as.matrix(read.csv(LabelsPath)))\r\n  Data <- Data[is.element(Labels,c('CD14+ Monocyte','CD19+ B','CD56+ NK')),]\r\n  Labels <- Labels[is.element(Labels,c('CD14+ Monocyte','CD19+ B','CD56+ NK'))]\r\n  Labels[Labels == 'CD14+ Monocyte'] <- 'CD14_Monocyte'\r\n  Labels[Labels == 'CD19+ B'] <- 'CD19_B'\r\n  Labels[Labels == 'CD56+ NK'] <- 'CD56_NK'\r\n  \r\n  \r\n  #############################################################################\r\n  #                                 SCINA                                     #\r\n  #############################################################################\r\n  library(SCINA)\r\n  Signature_Genes <- preprocess.signatures(GeneSigPath)\r\n  True_Labels_SCINA <- list()\r\n  Pred_Labels_SCINA <- list()\r\n  Total_Time_SCINA <- list()\r\n  \r\n  library(preprocessCore)\r\n  Data = t(as.matrix(Data))\r\n  Data=log(Data+1)\r\n  Data[]=normalize.quantiles(Data)\r\n  \r\n  start_time <- Sys.time()\r\n  results = SCINA(Data, Signature_Genes)\r\n  end_time <- Sys.time()\r\n  \r\n  True_Labels_SCINA <- Labels\r\n  Pred_Labels_SCINA <- results$cell_labels\r\n  Total_Time_SCINA <- as.numeric(difftime(end_time,start_time,units = 'secs'))\r\n  \r\n  setwd(OutputDir)\r\n  \r\n  write.csv(True_Labels_SCINA,'SCINA_True_Labels.csv',row.names = 
FALSE)\r\n  write.csv(Pred_Labels_SCINA,'SCINA_Pred_Labels.csv',row.names = FALSE)\r\n  write.csv(Total_Time_SCINA,'SCINA_Total_Time.csv',row.names = FALSE)\r\n}\r\n"
  },
  {
    "path": "Scripts/run_SVM.py",
    "content": "import os \r\nimport numpy as np\r\nimport pandas as pd\r\nimport time as tm\r\nfrom sklearn.svm import LinearSVC\r\nimport rpy2.robjects as robjects\r\n\r\n\r\ndef run_SVM(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = \"\", NumGenes = 0):\r\n    '''\r\n    run baseline classifier: SVM\r\n    Wrapper script to run an SVM classifier with a linear kernel on a benchmark dataset with 5-fold cross validation,\r\n    outputs lists of true and predicted cell labels as csv files, as well as computation time.\r\n  \r\n    Parameters\r\n    ----------\r\n    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes \r\n    as row names and gene names as column names.\r\n    LabelsPath : Cell population annotations file path (.csv).\r\n    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.\r\n    OutputDir : Output directory defining the path of the exported file.\r\n    GeneOrderPath : Gene order file path (.csv) obtained from feature selection, \r\n    defining the genes order for each cross validation fold, default is NULL.\r\n    NumGenes : Number of genes used in case of feature selection (integer), default is 0.\r\n    '''\r\n        \r\n    # read the Rdata file\r\n    robjects.r['load'](CV_RDataPath)\r\n\r\n    nfolds = np.array(robjects.r['n_folds'], dtype = 'int')\r\n    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')\r\n    col = np.array(robjects.r['col_Index'], dtype = 'int')\r\n    col = col - 1 \r\n    test_ind = np.array(robjects.r['Test_Idx'])\r\n    train_ind = np.array(robjects.r['Train_Idx'])\r\n\r\n    # read the data\r\n    data = pd.read_csv(DataPath,index_col=0,sep=',')\r\n    labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col)\r\n    \r\n    labels = labels.iloc[tokeep]\r\n    data = data.iloc[tokeep]\r\n    \r\n    # read the feature file\r\n    if (NumGenes > 0):\r\n        features = 
pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')\r\n    \r\n    # folder with results\r\n    os.chdir(OutputDir)\r\n    \r\n    # normalize data\r\n    data = np.log1p(data)\r\n        \r\n    Classifier = LinearSVC()\r\n            \r\n    tr_time=[]\r\n    ts_time=[]\r\n    truelab = []\r\n    pred = []\r\n        \r\n    for i in range(np.squeeze(nfolds)):\r\n        test_ind_i = np.array(test_ind[i], dtype = 'int') - 1\r\n        train_ind_i = np.array(train_ind[i], dtype = 'int') - 1\r\n    \r\n        train=data.iloc[train_ind_i]\r\n        test=data.iloc[test_ind_i]\r\n        y_train=labels.iloc[train_ind_i]\r\n        y_test=labels.iloc[test_ind_i]\r\n            \r\n        if (NumGenes > 0):\r\n            feat_to_use = features.iloc[0:NumGenes,i]\r\n            train = train.iloc[:,feat_to_use]\r\n            test = test.iloc[:,feat_to_use]\r\n            \r\n        start=tm.time()\r\n        Classifier.fit(train, y_train)\r\n        tr_time.append(tm.time()-start)\r\n                    \r\n        start=tm.time()\r\n        predicted = Classifier.predict(test)\r\n        ts_time.append(tm.time()-start)\r\n            \r\n        truelab.extend(y_test.values)\r\n        pred.extend(predicted)\r\n                \r\n    truelab = pd.DataFrame(truelab)\r\n    pred = pd.DataFrame(pred)\r\n        \r\n    tr_time = pd.DataFrame(tr_time)\r\n    ts_time = pd.DataFrame(ts_time)\r\n        \r\n    if (NumGenes == 0):  \r\n        truelab.to_csv(\"SVM_True_Labels.csv\", index = False)\r\n        pred.to_csv(\"SVM_Pred_Labels.csv\", index = False)\r\n        tr_time.to_csv(\"SVM_Training_Time.csv\", index = False)\r\n        ts_time.to_csv(\"SVM_Testing_Time.csv\", index = False)\r\n    else:\r\n        truelab.to_csv(\"SVM_\" + str(NumGenes) + \"_True_Labels.csv\", index = False)\r\n        pred.to_csv(\"SVM_\" + str(NumGenes) + \"_Pred_Labels.csv\", index = False)\r\n        tr_time.to_csv(\"SVM_\" + str(NumGenes) + \"_Training_Time.csv\", index = 
False)\r\n        ts_time.to_csv(\"SVM_\" + str(NumGenes) + \"_Testing_Time.csv\", index = False)\r\n\r\n    \r\n\r\n\r\n\r\n"
  },
  {
    "path": "Scripts/run_SVM_rejection.py",
    "content": "import os \r\nimport numpy as np\r\nimport pandas as pd\r\nimport time as tm\r\nfrom sklearn.svm import LinearSVC\r\nimport rpy2.robjects as robjects\r\nfrom sklearn.calibration import CalibratedClassifierCV\r\n\r\n\r\ndef run_SVM(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = \"\", NumGenes = 0, Threshold = 0.7):\r\n    '''\r\n    run baseline classifier: SVM\r\n    Wrapper script to run an SVM classifier with a linear kernel on a benchmark dataset with 5-fold cross validation,\r\n    outputs lists of true and predicted cell labels as csv files, as well as computation time.\r\n  \r\n    Parameters\r\n    ----------\r\n    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes \r\n    as row names and gene names as column names.\r\n    LabelsPath : Cell population annotations file path (.csv).\r\n    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.\r\n    OutputDir : Output directory defining the path of the exported file.\r\n    GeneOrderPath : Gene order file path (.csv) obtained from feature selection, \r\n    defining the genes order for each cross validation fold, default is NULL.\r\n    NumGenes : Number of genes used in case of feature selection (integer), default is 0.\r\n    Threshold : Threshold used when rejecting the cells, default is 0.7.\r\n\r\n    '''\r\n        \r\n    # read the Rdata file\r\n    robjects.r['load'](CV_RDataPath)\r\n\r\n    nfolds = np.array(robjects.r['n_folds'], dtype = 'int')\r\n    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')\r\n    col = np.array(robjects.r['col_Index'], dtype = 'int')\r\n    col = col - 1 \r\n    test_ind = np.array(robjects.r['Test_Idx'])\r\n    train_ind = np.array(robjects.r['Train_Idx'])\r\n\r\n    # read the data\r\n    data = pd.read_csv(DataPath,index_col=0,sep=',')\r\n    labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col)\r\n    \r\n    labels = 
labels.iloc[tokeep]\r\n    data = data.iloc[tokeep]\r\n    \r\n    # read the feature file\r\n    if (NumGenes > 0):\r\n        features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')\r\n    \r\n    # folder with results\r\n    os.chdir(OutputDir)\r\n    \r\n    # normalize data\r\n    data = np.log1p(data)\r\n        \r\n    Classifier = LinearSVC()\r\n    clf = CalibratedClassifierCV(Classifier)\r\n            \r\n    tr_time=[]\r\n    ts_time=[]\r\n    truelab = []\r\n    pred = []\r\n        \r\n    for i in range(np.squeeze(nfolds)):\r\n        test_ind_i = np.array(test_ind[i], dtype = 'int') - 1\r\n        train_ind_i = np.array(train_ind[i], dtype = 'int') - 1\r\n    \r\n        train=data.iloc[train_ind_i]\r\n        test=data.iloc[test_ind_i]\r\n        y_train=labels.iloc[train_ind_i]\r\n        y_test=labels.iloc[test_ind_i]\r\n            \r\n        if (NumGenes > 0):\r\n            feat_to_use = features.iloc[0:NumGenes,i]\r\n            train = train.iloc[:,feat_to_use]\r\n            test = test.iloc[:,feat_to_use]\r\n            \r\n        start=tm.time()\r\n        clf.fit(train, y_train)\r\n        tr_time.append(tm.time()-start)\r\n                    \r\n        start=tm.time()\r\n        predicted = clf.predict(test)\r\n        prob = np.max(clf.predict_proba(test), axis = 1)\r\n        unlabeled = np.where(prob < Threshold)\r\n        predicted[unlabeled] = 'Unknown'\r\n        ts_time.append(tm.time()-start)\r\n            \r\n        truelab.extend(y_test.values)\r\n        pred.extend(predicted)\r\n                \r\n    truelab = pd.DataFrame(truelab)\r\n    pred = pd.DataFrame(pred)\r\n        \r\n    tr_time = pd.DataFrame(tr_time)\r\n    ts_time = pd.DataFrame(ts_time)\r\n        \r\n    if (NumGenes == 0):  \r\n        truelab.to_csv(\"SVM_True_Labels.csv\", index = False)\r\n        pred.to_csv(\"SVM_Pred_Labels.csv\", index = False)\r\n        tr_time.to_csv(\"SVM_Training_Time.csv\", index = False)\r\n        
ts_time.to_csv(\"SVM_Testing_Time.csv\", index = False)\r\n    else:\r\n        truelab.to_csv(\"SVM_\" + str(NumGenes) + \"_True_Labels.csv\", index = False)\r\n        pred.to_csv(\"SVM_\" + str(NumGenes) + \"_Pred_Labels.csv\", index = False)\r\n        tr_time.to_csv(\"SVM_\" + str(NumGenes) + \"_Training_Time.csv\", index = False)\r\n        ts_time.to_csv(\"SVM_\" + str(NumGenes) + \"_Testing_Time.csv\", index = False)\r\n\r\n    \r\n\r\n\r\n\r\n"
  },
  {
    "path": "Scripts/run_SingleR.R",
    "content": "run_SingleR<-function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){\r\n  \"\r\n  run SingleR\r\n  Wrapper script to run SingleR on a benchmark dataset with 5-fold cross validation,\r\n  outputs lists of true and predicted cell labels as csv files, as well as computation time.\r\n  \r\n  Parameters\r\n  ----------\r\n  DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes \r\n  as row names and gene names as column names.\r\n  LabelsPath : Cell population annotations file path (.csv).\r\n  CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.\r\n  OutputDir : Output directory defining the path of the exported file.\r\n  GeneOrderPath : Gene order file path (.csv) obtained from feature selection, \r\n  defining the genes order for each cross validation fold, default is NULL.\r\n  NumGenes : Number of genes used in case of feature selection (integer), default is NULL.\r\n  \"\r\n  \r\n  Data <- read.csv(DataPath,row.names = 1)\r\n  Labels <- as.matrix(read.csv(LabelsPath))\r\n  load(CV_RDataPath)\r\n  Labels <- as.vector(Labels[,col_Index])\r\n  Data <- Data[Cells_to_Keep,]\r\n  Labels <- Labels[Cells_to_Keep]\r\n  if(!is.null(GeneOrderPath) & !is.null (NumGenes)){\r\n    GenesOrder = read.csv(GeneOrderPath)\r\n  }\r\n  \r\n  #############################################################################\r\n  #                               SingleR                                     #\r\n  #############################################################################\r\n  library(SingleR)\r\n  library(Seurat)\r\n  True_Labels_SingleR <- list()\r\n  Pred_Labels_SingleR <- list()\r\n  Total_Time_SingleR <- list()\r\n  Data = t(as.matrix(Data))\r\n  \r\n  for (i in c(1:n_folds)){\r\n    if(!is.null(GeneOrderPath) & !is.null (NumGenes)){\r\n      start_time <- Sys.time()\r\n      singler = SingleR(method = \"single\", 
Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Test_Idx[[i]]], \r\n                        Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Train_Idx[[i]]], \r\n                        Labels[Train_Idx[[i]]], numCores = 1)\r\n      end_time <- Sys.time()\r\n    }\r\n    else{\r\n      start_time <- Sys.time()\r\n      singler = SingleR(method = \"single\", Data[,Test_Idx[[i]]], Data[,Train_Idx[[i]]], Labels[Train_Idx[[i]]], numCores = 1)\r\n      end_time <- Sys.time()\r\n    }\r\n    Total_Time_SingleR[i] <- as.numeric(difftime(end_time,start_time,units = 'secs'))\r\n    \r\n    True_Labels_SingleR[i] <- list(Labels[Test_Idx[[i]]])\r\n    Pred_Labels_SingleR[i] <- list(as.vector(singler$labels))\r\n  }\r\n  True_Labels_SingleR <- as.vector(unlist(True_Labels_SingleR))\r\n  Pred_Labels_SingleR <- as.vector(unlist(Pred_Labels_SingleR))\r\n  Total_Time_SingleR <- as.vector(unlist(Total_Time_SingleR))\r\n  \r\n  setwd(OutputDir)\r\n  \r\n  if(!is.null(GeneOrderPath) & !is.null (NumGenes)){\r\n    write.csv(True_Labels_SingleR,paste('SingleR_',NumGenes,'_True_Labels.csv', sep = ''),row.names = FALSE)\r\n    write.csv(Pred_Labels_SingleR,paste('SingleR_',NumGenes,'_Pred_Labels.csv', sep = ''),row.names = FALSE)\r\n    write.csv(Total_Time_SingleR,paste('SingleR_',NumGenes,'_Total_Time.csv', sep = ''),row.names = FALSE)\r\n  }\r\n  else{\r\n    write.csv(True_Labels_SingleR,'SingleR_True_Labels.csv',row.names = FALSE)\r\n    write.csv(Pred_Labels_SingleR,'SingleR_Pred_Labels.csv',row.names = FALSE)\r\n    write.csv(Total_Time_SingleR,'SingleR_Total_Time.csv',row.names = FALSE)\r\n  }\r\n}\r\n"
  },
  {
    "path": "Scripts/run_kNN50.py",
    "content": "import os \r\nimport numpy as np\r\nimport pandas as pd\r\nimport time as tm\r\nfrom sklearn.neighbors import KNeighborsClassifier\r\nimport rpy2.robjects as robjects\r\n\r\n\r\ndef run_kNN50(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = \"\", NumGenes = 0):\r\n    '''\r\n    run baseline classifiers: kNN\r\n    Wrapper script to run kNN (with k = 50) classifier on a benchmark dataset with 5-fold cross validation,\r\n    outputs lists of true and predicted cell labels as csv files, as well as computation time.\r\n  \r\n    Parameters\r\n    ----------\r\n    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes \r\n    as row names and gene names as column names.\r\n    LabelsPath : Cell population annotations file path (.csv).\r\n    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.\r\n    OutputDir : Output directory defining the path of the exported file.\r\n    GeneOrderPath : Gene order file path (.csv) obtained from feature selection, \r\n    defining the genes order for each cross validation fold, default is NULL.\r\n    NumGenes : Number of genes used in case of feature selection (integer), default is 0.\r\n    '''\r\n        \r\n    # read the Rdata file\r\n    robjects.r['load'](CV_RDataPath)\r\n\r\n    nfolds = np.array(robjects.r['n_folds'], dtype = 'int')\r\n    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')\r\n    col = np.array(robjects.r['col_Index'], dtype = 'int')\r\n    col = col - 1 \r\n    test_ind = np.array(robjects.r['Test_Idx'])\r\n    train_ind = np.array(robjects.r['Train_Idx'])\r\n\r\n    # read the data\r\n    data = pd.read_csv(DataPath,index_col=0,sep=',')\r\n    labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col)\r\n    \r\n    labels = labels.iloc[tokeep]\r\n    data = data.iloc[tokeep]\r\n    \r\n    # read the feature file\r\n    if (NumGenes > 0):\r\n        features = 
pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')\r\n    \r\n    # folder with results\r\n    os.chdir(OutputDir)\r\n    \r\n    # normalize data\r\n    data = np.log1p(data)\r\n        \r\n    Classifier = KNeighborsClassifier(n_neighbors=50)\r\n            \r\n    tr_time=[]\r\n    ts_time=[]\r\n    truelab = []\r\n    pred = []\r\n        \r\n    for i in range(np.squeeze(nfolds)):\r\n        test_ind_i = np.array(test_ind[i], dtype = 'int') - 1\r\n        train_ind_i = np.array(train_ind[i], dtype = 'int') - 1\r\n    \r\n        train=data.iloc[train_ind_i]\r\n        test=data.iloc[test_ind_i]\r\n        y_train=labels.iloc[train_ind_i]\r\n        y_test=labels.iloc[test_ind_i]\r\n            \r\n        if (NumGenes > 0):\r\n            feat_to_use = features.iloc[0:NumGenes,i]\r\n            train = train.iloc[:,feat_to_use]\r\n            test = test.iloc[:,feat_to_use]\r\n            \r\n        start=tm.time()\r\n        Classifier.fit(train, y_train)\r\n        tr_time.append(tm.time()-start)\r\n                    \r\n        start=tm.time()\r\n        predicted = Classifier.predict(test)\r\n        ts_time.append(tm.time()-start)\r\n            \r\n        truelab.extend(y_test.values)\r\n        pred.extend(predicted)\r\n                \r\n    truelab = pd.DataFrame(truelab)\r\n    pred = pd.DataFrame(pred)\r\n        \r\n    tr_time = pd.DataFrame(tr_time)\r\n    ts_time = pd.DataFrame(ts_time)\r\n        \r\n    if (NumGenes == 0):  \r\n        truelab.to_csv(\"kNN50_True_Labels.csv\", index = False)\r\n        pred.to_csv(\"kNN50_Pred_Labels.csv\", index = False)\r\n        tr_time.to_csv(\"kNN50_Training_Time.csv\", index = False)\r\n        ts_time.to_csv(\"kNN50_Testing_Time.csv\", index = False)\r\n    else:\r\n        truelab.to_csv(\"kNN50_\" + str(NumGenes) + \"_True_Labels.csv\", index = False)\r\n        pred.to_csv(\"kNN50_\" + str(NumGenes) + \"_Pred_Labels.csv\", index = False)\r\n        tr_time.to_csv(\"kNN50_\" + 
str(NumGenes) + \"_Training_Time.csv\", index = False)\r\n        ts_time.to_csv(\"kNN50_\" + str(NumGenes) + \"_Testing_Time.csv\", index = False)\r\n\r\n    \r\n\r\n\r\n\r\n"
  },
  {
    "path": "Scripts/run_kNN9.py",
    "content": "import os \r\nimport numpy as np\r\nimport pandas as pd\r\nimport time as tm\r\nfrom sklearn.neighbors import KNeighborsClassifier\r\nimport rpy2.robjects as robjects\r\n\r\n\r\ndef run_kNN9(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = \"\", NumGenes = 0):\r\n    '''\r\n    run baseline classifiers: kNN\r\n    Wrapper script to run kNN (with k = 9) classifier on a benchmark dataset with 5-fold cross validation,\r\n    outputs lists of true and predicted cell labels as csv files, as well as computation time.\r\n  \r\n    Parameters\r\n    ----------\r\n    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes \r\n    as row names and gene names as column names.\r\n    LabelsPath : Cell population annotations file path (.csv).\r\n    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.\r\n    OutputDir : Output directory defining the path of the exported file.\r\n    GeneOrderPath : Gene order file path (.csv) obtained from feature selection, \r\n    defining the genes order for each cross validation fold, default is NULL.\r\n    NumGenes : Number of genes used in case of feature selection (integer), default is 0.\r\n    '''\r\n        \r\n    # read the Rdata file\r\n    robjects.r['load'](CV_RDataPath)\r\n\r\n    nfolds = np.array(robjects.r['n_folds'], dtype = 'int')\r\n    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')\r\n    col = np.array(robjects.r['col_Index'], dtype = 'int')\r\n    col = col - 1 \r\n    test_ind = np.array(robjects.r['Test_Idx'])\r\n    train_ind = np.array(robjects.r['Train_Idx'])\r\n\r\n    # read the data\r\n    data = pd.read_csv(DataPath,index_col=0,sep=',')\r\n    labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col)\r\n    \r\n    labels = labels.iloc[tokeep]\r\n    data = data.iloc[tokeep]\r\n    \r\n    # read the feature file\r\n    if (NumGenes > 0):\r\n        features = 
pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')\r\n    \r\n    # folder with results\r\n    os.chdir(OutputDir)\r\n    \r\n    # normalize data\r\n    data = np.log1p(data)\r\n        \r\n    Classifier = KNeighborsClassifier(n_neighbors=9)\r\n            \r\n    tr_time=[]\r\n    ts_time=[]\r\n    truelab = []\r\n    pred = []\r\n        \r\n    for i in range(np.squeeze(nfolds)):\r\n        test_ind_i = np.array(test_ind[i], dtype = 'int') - 1\r\n        train_ind_i = np.array(train_ind[i], dtype = 'int') - 1\r\n    \r\n        train=data.iloc[train_ind_i]\r\n        test=data.iloc[test_ind_i]\r\n        y_train=labels.iloc[train_ind_i]\r\n        y_test=labels.iloc[test_ind_i]\r\n            \r\n        if (NumGenes > 0):\r\n            feat_to_use = features.iloc[0:NumGenes,i]\r\n            train = train.iloc[:,feat_to_use]\r\n            test = test.iloc[:,feat_to_use]\r\n            \r\n        start=tm.time()\r\n        Classifier.fit(train, y_train)\r\n        tr_time.append(tm.time()-start)\r\n                    \r\n        start=tm.time()\r\n        predicted = Classifier.predict(test)\r\n        ts_time.append(tm.time()-start)\r\n            \r\n        truelab.extend(y_test.values)\r\n        pred.extend(predicted)\r\n                \r\n    truelab = pd.DataFrame(truelab)\r\n    pred = pd.DataFrame(pred)\r\n        \r\n    tr_time = pd.DataFrame(tr_time)\r\n    ts_time = pd.DataFrame(ts_time)\r\n        \r\n    if (NumGenes == 0):  \r\n        truelab.to_csv(\"kNN9_True_Labels.csv\", index = False)\r\n        pred.to_csv(\"kNN9_Pred_Labels.csv\", index = False)\r\n        tr_time.to_csv(\"kNN9_Training_Time.csv\", index = False)\r\n        ts_time.to_csv(\"kNN9_Testing_Time.csv\", index = False)\r\n    else:\r\n        truelab.to_csv(\"kNN9_\" + str(NumGenes) + \"_True_Labels.csv\", index = False)\r\n        pred.to_csv(\"kNN9_\" + str(NumGenes) + \"_Pred_Labels.csv\", index = False)\r\n        tr_time.to_csv(\"kNN9_\" + str(NumGenes) + 
\"_Training_Time.csv\", index = False)\r\n        ts_time.to_csv(\"kNN9_\" + str(NumGenes) + \"_Testing_Time.csv\", index = False)\r\n\r\n    \r\n\r\n\r\n\r\n"
  },
  {
    "path": "Scripts/run_moana.py",
    "content": "import os\r\nimport pandas as pd\r\nimport numpy as np\r\nfrom moana.core import ExpMatrix\r\nfrom moana.classify import CellTypeClassifier\r\nimport time as tm\r\nimport rpy2.robjects as robjects\r\n\r\ndef run_moana(DataPath, LabelsPath, ClassifierPath, OutputDir, GeneOrderPath = \"\", NumGenes = 0):\r\n    '''\r\n    run moana\r\n    Wrapper script to run moana on a benchmark dataset with a pretrained classifier,\r\n    outputs lists of true and predicted cell labels as csv files, as well as computation time.  \r\n  \r\n    Parameters\r\n    ----------\r\n    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes \r\n    as row names and gene names as column names.\r\n    LabelsPath : Cell population annotations file path (.csv).\r\n    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.\r\n    ClassifierPath : Data file path to the pretrained classifier.\r\n    OutputDir : Output directory defining the path of the exported file.\r\n    GeneOrderPath : Gene order file path (.csv) obtained from feature selection, \r\n    defining the genes order for each cross validation fold, default is NULL.\r\n    NumGenes : Number of genes used in case of feature selection (integer), default is 0.\r\n    '''\r\n    \r\n#    # read the Rdata file\r\n#    robjects.r['load'](CV_RDataPath)\r\n#\r\n#    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')\r\n#    col = np.array(robjects.r['col_Index'], dtype = 'int')\r\n#    col = col - 1\r\n    \r\n    matrix = ExpMatrix.read_tsv(DataPath, sep = ',')    \r\n#    matrix = matrix.iloc[tokeep] \r\n    \r\n    truelab = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',')\r\n#    truelab = truelab.iloc[tokeep]\r\n    \r\n    ct_old = ['CD19+ B','CD14+ Monocyte','CD4+/CD45RA+/CD25- Naive T','CD4+/CD45RO+ Memory','CD8+/CD45RA+ Naive Cytotoxic','Dendritic', 'CD56+ NK']\r\n    ct_new = ['B cells','CD14+ monocytes','Naive CD4+ T 
cells','Memory CD4+ T cells','Naive CD8+ T cells','Dendritic cells','NK cells']\r\n    \r\n    tokeep2 = np.isin(truelab,ct_old)\r\n    truelab = truelab[tokeep2]\r\n    print(len(truelab))\r\n    matrix = matrix.iloc[np.squeeze(tokeep2)]\r\n    \r\n    for i in range(len(ct_old)):\r\n        truelab.iloc[truelab == ct_old[i]] = ct_new[i]\r\n        \r\n    # read the feature file\r\n    if (NumGenes > 0):\r\n        features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')\r\n        feat_to_use = features.iloc[0:NumGenes,0]\r\n        matrix = matrix.iloc[:,feat_to_use]\r\n\r\n    data = ExpMatrix(X = np.transpose(matrix.X), genes = matrix.cells, cells = matrix.genes)\r\n    data.genes.name = 'Genes'\r\n    data.cells.name = 'Cells'\r\n    data.index.name = 'Genes'\r\n    data.columns.name = 'Cells'\r\n    \r\n    clf = CellTypeClassifier.read_pickle(ClassifierPath)\r\n    \r\n    start = tm.time()\r\n    predictions = clf.predict(data)\r\n    runtime = tm.time() - start\r\n    \r\n    np.asarray(predictions)\r\n    \r\n    pred = pd.DataFrame(predictions)\r\n        \r\n    os.chdir(OutputDir)\r\n            \r\n    if (NumGenes == 0):  \r\n        truelab.to_csv(\"moana_True_Labels.csv\", index = False)\r\n        pred.to_csv(\"moana_Pred_Labels.csv\", index = False)\r\n        with open(\"moana_Total_Time.csv\", 'w') as f:\r\n            f.write(\"%f\\n\" % runtime)\r\n    else:\r\n        truelab.to_csv(\"moana_\" + str(NumGenes) + \"_True_Labels.csv\", index = False)\r\n        pred.to_csv(\"moana_\" + str(NumGenes) + \"_Pred_Labels.csv\", index = False)\r\n        with open(\"moana_\" + str(NumGenes) + \"_Total_Time.csv\", 'w') as f:\r\n            f.write(\"%f\\n\" % runtime)\r\n\r\n\r\n        \r\n    \r\n    \r\n    \r\n    \r\n    \r\n    \r\n"
  },
  {
    "path": "Scripts/run_scID.R",
    "content": "run_scID<-function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){\r\n  \"\r\n  run scID\r\n  Wrapper script to run scID on a benchmark dataset with 5-fold cross validation,\r\n  outputs lists of true and predicted cell labels as csv files, as well as computation time.\r\n  \r\n  Parameters\r\n  ----------\r\n  DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes \r\n  as row names and gene names as column names.\r\n  LabelsPath : Cell population annotations file path (.csv).\r\n  CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.\r\n  OutputDir : Output directory defining the path of the exported file.\r\n  GeneOrderPath : Gene order file path (.csv) obtained from feature selection, \r\n  defining the genes order for each cross validation fold, default is NULL.\r\n  NumGenes : Number of genes used in case of feature selection (integer), default is NULL.\r\n  \"\r\n  \r\n  Data <- read.csv(DataPath,row.names = 1)\r\n  Labels <- as.matrix(read.csv(LabelsPath))\r\n  load(CV_RDataPath)\r\n  Labels <- as.vector(Labels[,col_Index])\r\n  Data <- Data[Cells_to_Keep,]\r\n  Labels <- Labels[Cells_to_Keep]\r\n  if(!is.null(GeneOrderPath) & !is.null (NumGenes)){\r\n    GenesOrder = read.csv(GeneOrderPath)\r\n  }\r\n  \r\n  #############################################################################\r\n  #                                 scID                                      #\r\n  #############################################################################\r\n  library(scID)\r\n  library(Seurat)\r\n  True_Labels_scID <- list()\r\n  Pred_Labels_scID <- list()\r\n  Total_Time_scID <- list()\r\n  Data = t(as.matrix(Data))\r\n  \r\n  for (i in c(1:n_folds)){\r\n    if(!is.null(GeneOrderPath) & !is.null (NumGenes)){\r\n      Train_Labels <- list(Labels[Train_Idx[[i]]])\r\n      names(Train_Labels[[1]]) <- 
colnames(Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Train_Idx[[i]]])\r\n      start_time <- Sys.time()\r\n      scID_output <- scid_multiclass(Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Test_Idx[[i]]], \r\n                                     Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Train_Idx[[i]]], \r\n                                     Train_Labels[[1]])\r\n      end_time <- Sys.time()\r\n    }\r\n    else{\r\n      Train_Labels <- list(Labels[Train_Idx[[i]]])\r\n      names(Train_Labels[[1]]) <- colnames(Data[,Train_Idx[[i]]])\r\n      start_time <- Sys.time()\r\n      scID_output <- scid_multiclass(Data[,Test_Idx[[i]]], Data[,Train_Idx[[i]]], Train_Labels[[1]])\r\n      end_time <- Sys.time()\r\n    }\r\n    Total_Time_scID[i] <- as.numeric(difftime(end_time,start_time,units = 'secs'))\r\n    \r\n    True_Labels_scID[i] <- list(Labels[Test_Idx[[i]]])\r\n    Pred_Labels_scID[i] <- list(as.vector(scID_output$labels))\r\n  }\r\n  True_Labels_scID <- as.vector(unlist(True_Labels_scID))\r\n  Pred_Labels_scID <- as.vector(unlist(Pred_Labels_scID))\r\n  Total_Time_scID <- as.vector(unlist(Total_Time_scID))\r\n  \r\n  setwd(OutputDir)\r\n  \r\n  if(!is.null(GeneOrderPath) & !is.null (NumGenes)){\r\n    write.csv(True_Labels_scID,paste('scID_',NumGenes,'_True_Labels.csv', sep = ''),row.names = FALSE)\r\n    write.csv(Pred_Labels_scID,paste('scID_',NumGenes,'_Pred_Labels.csv', sep = ''),row.names = FALSE)\r\n    write.csv(Total_Time_scID,paste('scID_',NumGenes,'_Total_Time.csv', sep = ''),row.names = FALSE)\r\n  }\r\n  else{\r\n    write.csv(True_Labels_scID,'scID_True_Labels.csv',row.names = FALSE)\r\n    write.csv(Pred_Labels_scID,'scID_Pred_Labels.csv',row.names = FALSE)\r\n    write.csv(Total_Time_scID,'scID_Total_Time.csv',row.names = FALSE)\r\n  }\r\n}\r\n"
  },
  {
    "path": "Scripts/run_scPred.R",
    "content": "run_scPred<-function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){\r\n  \"\r\n  run scPred\r\n  Wrapper script to run scPred on a benchmark dataset with 5-fold cross validation,\r\n  outputs lists of true and predicted cell labels as csv files, as well as computation time.\r\n  \r\n  Parameters\r\n  ----------\r\n  DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes \r\n  as row names and gene names as column names.\r\n  LabelsPath : Cell population annotations file path (.csv).\r\n  CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.\r\n  OutputDir : Output directory defining the path of the exported file.\r\n  GeneOrderPath : Gene order file path (.csv) obtained from feature selection, \r\n  defining the genes order for each cross validation fold, default is NULL.\r\n  NumGenes : Number of genes used in case of feature selection (integer), default is NULL.\r\n  \"\r\n  \r\n  Data <- read.csv(DataPath,row.names = 1)\r\n  Labels <- as.matrix(read.csv(LabelsPath))\r\n  load(CV_RDataPath)\r\n  Labels <- as.vector(Labels[,col_Index])\r\n  Data <- Data[Cells_to_Keep,]\r\n  Labels <- Labels[Cells_to_Keep]\r\n  if(!is.null(GeneOrderPath) & !is.null (NumGenes)){\r\n    GenesOrder = read.csv(GeneOrderPath)\r\n  }\r\n  \r\n  #############################################################################\r\n  #                                scPred                                     #\r\n  #############################################################################\r\n  library(scPred)\r\n  library(tidyverse)\r\n  library(SingleCellExperiment)\r\n  True_Labels_scPred <- list()\r\n  Pred_Labels_scPred <- list()\r\n  Training_Time_scPred <- list()\r\n  Testing_Time_scPred <- list()\r\n  Data = t(as.matrix(Data))\r\n  \r\n  for (i in c(1:n_folds)){\r\n    if(!is.null(GeneOrderPath) & !is.null (NumGenes)){\r\n      sce <- 
SingleCellExperiment(list(normcounts = Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Train_Idx[[i]]]), \r\n                                  colData = data.frame(cell_type1 = Labels[Train_Idx[[i]]]))\r\n      sce_counts <- normcounts(sce)\r\n      sce_cpm <- apply(sce_counts, 2, function(x) (x/sum(x))*1000000)\r\n      sce_metadata <- as.data.frame(colData(sce))\r\n      \r\n      sce_test <- SingleCellExperiment(list(normcounts = Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Test_Idx[[i]]]), \r\n                                       colData = data.frame(cell_type1 = Labels[Test_Idx[[i]]]))\r\n      sce_counts_test <- normcounts(sce_test)\r\n      sce_cpm_test <- apply(sce_counts_test, 2, function(x) (x/sum(x))*1000000)\r\n      sce_metadata_test <- as.data.frame(colData(sce_test))\r\n    }\r\n    else{\r\n      sce <- SingleCellExperiment(list(normcounts = Data[,Train_Idx[[i]]]), \r\n                                  colData = data.frame(cell_type1 = Labels[Train_Idx[[i]]]))\r\n      sce_counts <- normcounts(sce)\r\n      sce_cpm <- apply(sce_counts, 2, function(x) (x/sum(x))*1000000)\r\n      sce_metadata <- as.data.frame(colData(sce))\r\n      \r\n      sce_test <- SingleCellExperiment(list(normcounts = Data[,Test_Idx[[i]]]), \r\n                                       colData = data.frame(cell_type1 = Labels[Test_Idx[[i]]]))\r\n      sce_counts_test <- normcounts(sce_test)\r\n      sce_cpm_test <- apply(sce_counts_test, 2, function(x) (x/sum(x))*1000000)\r\n      sce_metadata_test <- as.data.frame(colData(sce_test))\r\n    }\r\n    \r\n    \r\n    # scPred Training    \r\n    start_time <- Sys.time()\r\n    set.seed(1234)\r\n    scp <- eigenDecompose(sce_cpm)\r\n    scPred::metadata(scp) <- sce_metadata\r\n    scp <- getFeatureSpace(scp, pVar = 'cell_type1')\r\n    # plotEigen(scp, group = 'cell_type1')\r\n    scp <- trainModel(scp)\r\n    # plotTrainProbs(scp)\r\n    end_time <- Sys.time()\r\n    Training_Time_scPred[i] <- 
as.numeric(difftime(end_time,start_time,units = 'secs'))\r\n    \r\n    # scPred Prediction\r\n    start_time <- Sys.time()\r\n    scp <- scPredict(scp,newData = sce_cpm_test)\r\n    end_time <- Sys.time()\r\n    Testing_Time_scPred[i] <- as.numeric(difftime(end_time,start_time,units = 'secs'))\r\n    \r\n    True_Labels_scPred[i] <- list(Labels[Test_Idx[[i]]])\r\n    Pred_Labels_scPred[i] <- list(getPredictions(scp)$predClass)\r\n  }\r\n  True_Labels_scPred <- as.vector(unlist(True_Labels_scPred))\r\n  Pred_Labels_scPred <- as.vector(unlist(Pred_Labels_scPred))\r\n  Training_Time_scPred <- as.vector(unlist(Training_Time_scPred))\r\n  Testing_Time_scPred <- as.vector(unlist(Testing_Time_scPred))\r\n  \r\n  setwd(OutputDir)\r\n  \r\n  if(!is.null(GeneOrderPath) & !is.null (NumGenes)){\r\n    write.csv(True_Labels_scPred,paste('scPred_',NumGenes,'_True_Labels.csv', sep = ''),row.names = FALSE)\r\n    write.csv(Pred_Labels_scPred,paste('scPred_',NumGenes,'_Pred_Labels.csv', sep = ''),row.names = FALSE)\r\n    write.csv(Training_Time_scPred,paste('scPred_',NumGenes,'_Training_Time.csv', sep = ''),row.names = FALSE)\r\n    write.csv(Testing_Time_scPred,paste('scPred_',NumGenes,'_Testing_Time.csv', sep = ''),row.names = FALSE)\r\n  }\r\n  else{\r\n    write.csv(True_Labels_scPred,'scPred_True_Labels.csv',row.names = FALSE)\r\n    write.csv(Pred_Labels_scPred,'scPred_Pred_Labels.csv',row.names = FALSE)\r\n    write.csv(Training_Time_scPred,'scPred_Training_Time.csv',row.names = FALSE)\r\n    write.csv(Testing_Time_scPred,'scPred_Testing_Time.csv',row.names = FALSE)\r\n  }\r\n}\r\n"
  },
  {
    "path": "Scripts/run_scVI.py",
    "content": "from scvi.dataset import CsvDataset\r\nimport os\r\nimport numpy as np\r\nimport pandas as pd\r\nfrom scvi.models import SCANVI\r\nfrom scvi.inference import SemiSupervisedTrainer\r\nimport time as tm\r\nimport rpy2.robjects as robjects\r\n\r\ndef run_scVI(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = \"\", NumGenes = 0):\r\n    '''\r\n    run scVI\r\n    Wrapper script to run scVI on a benchmark dataset with 5-fold cross validation,\r\n    outputs lists of true and predicted cell labels as csv files, as well as computation time.\r\n  \r\n    Parameters\r\n    ----------\r\n    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes \r\n    as row names and gene names as column names.\r\n    LabelsPath : Cell population annotations file path (.csv).\r\n    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.\r\n    OutputDir : Output directory defining the path of the exported file.\r\n    GeneOrderPath : Gene order file path (.csv) obtained from feature selection, \r\n    defining the genes order for each cross validation fold, default is NULL.\r\n    NumGenes : Number of genes used in case of feature selection (integer), default is 0.\r\n    '''\r\n    \r\n    # read the Rdata file\r\n    robjects.r['load'](CV_RDataPath)\r\n\r\n    nfolds = np.array(robjects.r['n_folds'], dtype = 'int')\r\n    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')\r\n    col = np.array(robjects.r['col_Index'], dtype = 'int')\r\n    col = col - 1 \r\n    test_ind = np.array(robjects.r['Test_Idx'])\r\n    train_ind = np.array(robjects.r['Train_Idx'])\r\n\r\n    # read the data\r\n    data = pd.read_csv(DataPath,index_col=0,sep=',')\r\n    labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col)\r\n\r\n    labels = labels.iloc[tokeep]\r\n    data = data.iloc[tokeep] \r\n    \r\n    # read the feature file\r\n    if (NumGenes > 0):\r\n        
features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')\r\n        \r\n    os.chdir(OutputDir)\r\n    \r\n    if (NumGenes == 0):\r\n        #save labels as csv file with header and index column\r\n        labels.to_csv('Labels_scvi.csv')\r\n        data.to_csv('Data_scvi.csv')    \r\n        \r\n        train = CsvDataset('Data_scvi.csv', save_path = \"\", sep = \",\", labels_file = \"Labels_scvi.csv\", gene_by_cell = False)\r\n        \r\n        ## this semisupervised trainer automatically uses a part of the input data for training and a part for testing\r\n        scanvi = SCANVI(train.nb_genes, train.n_batches, train.n_labels)\r\n        trainer_scanvi = SemiSupervisedTrainer(scanvi, train, frequency=5)\r\n    \r\n    n_epochs = 200\r\n    \r\n    truelab = []\r\n    pred = []\r\n    tr_time = []\r\n    ts_time = []\r\n    \r\n    for i in range(np.squeeze(nfolds)):\r\n        test_ind_i = np.array(test_ind[i], dtype = 'int') - 1\r\n        train_ind_i = np.array(train_ind[i], dtype = 'int') - 1\r\n        \r\n        if (NumGenes > 0):\r\n            feat_to_use = features.iloc[0:NumGenes,i]\r\n            data2 = data.iloc[:,feat_to_use]\r\n            \r\n            labels.to_csv('Labels_scvi.csv')\r\n            data2.to_csv('Data_scvi.csv')    \r\n            \r\n            train = CsvDataset('Data_scvi.csv', save_path = \"\", sep = \",\", labels_file = \"Labels_scvi.csv\", gene_by_cell = False, new_n_genes = False)\r\n            \r\n            ## this semisupervised trainer automatically uses a part of the input data for training and a part for testing\r\n            scanvi = SCANVI(train.nb_genes, train.n_batches, train.n_labels)\r\n            trainer_scanvi = SemiSupervisedTrainer(scanvi, train, frequency=5)\r\n\r\n        trainer_scanvi.labelled_set = trainer_scanvi.create_posterior(indices=(train_ind_i).ravel(), shuffle = False)\r\n        trainer_scanvi.labelled_set.to_monitor = ['ll','accuracy']\r\n        
trainer_scanvi.unlabelled_set = trainer_scanvi.create_posterior(indices=(test_ind_i).ravel(), shuffle = False)\r\n        trainer_scanvi.unlabelled_set.to_monitor = ['ll','accuracy']\r\n    \r\n        start = tm.time()\r\n        trainer_scanvi.train(n_epochs)\r\n        tr_time.append(tm.time()-start)\r\n    \r\n        ## labels of test set are in y_pred\r\n        ## labels are returned in numbers, should be mapped back to the real labels\r\n        ## indices are permutated\r\n        start = tm.time()\r\n        y_true, y_pred = trainer_scanvi.unlabelled_set.compute_predictions()\r\n        ts_time.append(tm.time()-start)\r\n        \r\n        truelab.extend(y_true)\r\n        pred.extend(y_pred)\r\n    \r\n    #write results\r\n    \r\n    truelab = pd.DataFrame(truelab)\r\n    pred = pd.DataFrame(pred)\r\n    \r\n    tr_time = pd.DataFrame(tr_time)\r\n    ts_time = pd.DataFrame(ts_time)\r\n\r\n    \r\n    if (NumGenes == 0):  \r\n        truelab.to_csv(\"scVI_True_Labels.csv\", index = False)\r\n        pred.to_csv(\"scVI_Pred_Labels.csv\", index = False)\r\n        tr_time.to_csv(\"scVI_Training_Time.csv\", index = False)\r\n        ts_time.to_csv(\"scVI_Testing_Time.csv\", index = False)\r\n    else:\r\n        truelab.to_csv(\"scVI_\" + str(NumGenes) + \"_True_Labels.csv\", index = False)\r\n        pred.to_csv(\"scVI_\" + str(NumGenes) + \"_Pred_Labels.csv\", index = False)\r\n        tr_time.to_csv(\"scVI_\" + str(NumGenes) + \"_Training_Time.csv\", index = False)\r\n        ts_time.to_csv(\"scVI_\" + str(NumGenes) + \"_Testing_Time.csv\", index = False)\r\n        \r\n\r\n\r\n"
  },
  {
    "path": "Scripts/run_scmap.R",
    "content": "run_scmap <- function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){\r\n  \"\r\n  run scmap\r\n  Wrapper script to run scmap on a benchmark dataset with 5-fold cross validation,\r\n  outputs lists of true and predicted cell labels as csv files, as well as computation time.\r\n  \r\n  Parameters\r\n  ----------\r\n  DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes \r\n  as row names and gene names as column names.\r\n  LabelsPath : Cell population annotations file path (.csv).\r\n  CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.\r\n  OutputDir : Output directory defining the path of the exported file.\r\n  GeneOrderPath : Gene order file path (.csv) obtained from feature selection, \r\n  defining the genes order for each cross validation fold, default is NULL.\r\n  NumGenes : Number of genes used in case of feature selection (integer), default is NULL.\r\n  \"\r\n  \r\n  Data <- read.csv(DataPath,row.names = 1)\r\n  Labels <- as.matrix(read.csv(LabelsPath))\r\n  load(CV_RDataPath)\r\n  Labels <- as.vector(Labels[,col_Index])\r\n  Data <- Data[Cells_to_Keep,]\r\n  Labels <- Labels[Cells_to_Keep]\r\n  if(!is.null(GeneOrderPath) & !is.null (NumGenes)){\r\n    GenesOrder = read.csv(GeneOrderPath)\r\n  }\r\n  \r\n  #############################################################################\r\n  #                                 scmap                                     #\r\n  #############################################################################\r\n  library(scmap)\r\n  library(SingleCellExperiment)\r\n  True_Labels_scmapcluster <- list()\r\n  Pred_Labels_scmapcluster <- list()\r\n  True_Labels_scmapcell <- list()\r\n  Pred_Labels_scmapcell <- list()\r\n  Training_Time_scmapcluster <- list()\r\n  Testing_Time_scmapcluster <- list()\r\n  Training_Time_scmapcell <- list()\r\n  Testing_Time_scmapcell <- list()\r\n  Data = 
t(as.matrix(Data))\r\n  \r\n  for (i in c(1:n_folds)){\r\n    if(!is.null(GeneOrderPath) & !is.null (NumGenes)){\r\n      sce <- SingleCellExperiment(list(normcounts = Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Train_Idx[[i]]]), \r\n                                  colData = data.frame(cell_type1 = Labels[Train_Idx[[i]]]))\r\n      logcounts(sce) <- log2(normcounts(sce) + 1)\r\n      # use gene names as feature symbols\r\n      rowData(sce)$feature_symbol <- rownames(sce)\r\n      sce <- selectFeatures(sce, n_features = NumGenes, suppress_plot = TRUE)\r\n      \r\n      sce_test <- SingleCellExperiment(list(normcounts = Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Test_Idx[[i]]]), \r\n                                       colData = data.frame(cell_type1 = Labels[Test_Idx[[i]]]))\r\n      logcounts(sce_test) <- log2(normcounts(sce_test) + 1)\r\n      rowData(sce_test)$feature_symbol <- rownames(sce_test)\r\n      sce_test@rowRanges@elementMetadata@listData = sce@rowRanges@elementMetadata@listData\r\n    }\r\n    else{\r\n      sce <- SingleCellExperiment(list(normcounts = Data[,Train_Idx[[i]]]), \r\n                                  colData = data.frame(cell_type1 = Labels[Train_Idx[[i]]]))\r\n      logcounts(sce) <- log2(normcounts(sce) + 1)\r\n      # use gene names as feature symbols\r\n      rowData(sce)$feature_symbol <- rownames(sce)\r\n      sce <- selectFeatures(sce, suppress_plot = TRUE)\r\n      \r\n      sce_test <- SingleCellExperiment(list(normcounts = Data[,Test_Idx[[i]]]), \r\n                                       colData = data.frame(cell_type1 = Labels[Test_Idx[[i]]]))\r\n      logcounts(sce_test) <- log2(normcounts(sce_test) + 1)\r\n      rowData(sce_test)$feature_symbol <- rownames(sce_test)\r\n      sce_test@rowRanges@elementMetadata@listData = sce@rowRanges@elementMetadata@listData\r\n    }\r\n    \r\n    # scmap-cluster\r\n    start_time <- Sys.time()\r\n    sce <- indexCluster(sce)\r\n    end_time <- Sys.time()\r\n    
Training_Time_scmapcluster[i] <- as.numeric(difftime(end_time,start_time,units = 'secs'))\r\n    \r\n    start_time <- Sys.time()\r\n    scmapCluster_results <- scmapCluster(projection = sce_test,index_list = list(metadata(sce)$scmap_cluster_index))\r\n    end_time <- Sys.time()\r\n    Testing_Time_scmapcluster[i] <- as.numeric(difftime(end_time,start_time,units = 'secs'))\r\n    \r\n    True_Labels_scmapcluster[i] <- list(Labels[Test_Idx[[i]]])\r\n    Pred_Labels_scmapcluster[i] <- list(scmapCluster_results$combined_labs)\r\n    \r\n    # scmap-cell\r\n    start_time <- Sys.time()\r\n    set.seed(1)\r\n    sce <- indexCell(sce)\r\n    end_time <- Sys.time()\r\n    Training_Time_scmapcell[i] <- as.numeric(difftime(end_time,start_time,units = 'secs'))\r\n    \r\n    start_time <- Sys.time()\r\n    scmapCell_results <- scmapCell(sce_test,list(metadata(sce)$scmap_cell_index))\r\n    scmapCell_clusters <- scmapCell2Cluster(scmapCell_results,list(as.character(colData(sce)$cell_type1)))\r\n    end_time <- Sys.time()\r\n    Testing_Time_scmapcell[i] <- as.numeric(difftime(end_time,start_time,units = 'secs'))\r\n    \r\n    True_Labels_scmapcell[i] <- list(Labels[Test_Idx[[i]]])\r\n    Pred_Labels_scmapcell[i] <- list(scmapCell_clusters$combined_labs)\r\n  }\r\n  \r\n  True_Labels_scmapcluster <- as.vector(unlist(True_Labels_scmapcluster))\r\n  Pred_Labels_scmapcluster <- as.vector(unlist(Pred_Labels_scmapcluster))\r\n  True_Labels_scmapcell <- as.vector(unlist(True_Labels_scmapcell))\r\n  Pred_Labels_scmapcell <- as.vector(unlist(Pred_Labels_scmapcell))\r\n  Training_Time_scmapcluster <- as.vector(unlist(Training_Time_scmapcluster))\r\n  Testing_Time_scmapcluster <- as.vector(unlist(Testing_Time_scmapcluster))\r\n  Training_Time_scmapcell <- as.vector(unlist(Training_Time_scmapcell))\r\n  Testing_Time_scmapcell <- as.vector(unlist(Testing_Time_scmapcell))\r\n  \r\n  setwd(OutputDir)\r\n  \r\n  if (!is.null(GeneOrderPath) & !is.null (NumGenes)){\r\n    
write.csv(True_Labels_scmapcluster,paste('scmapcluster_',NumGenes,'_True_Labels.csv', sep = ''),row.names = FALSE)\r\n    write.csv(Pred_Labels_scmapcluster,paste('scmapcluster_',NumGenes,'_Pred_Labels.csv', sep = ''),row.names = FALSE)\r\n    write.csv(True_Labels_scmapcell,paste('scmapcell_',NumGenes,'_True_Labels.csv', sep = ''),row.names = FALSE)\r\n    write.csv(Pred_Labels_scmapcell,paste('scmapcell_',NumGenes,'_Pred_Labels.csv', sep = ''),row.names = FALSE)\r\n    write.csv(Training_Time_scmapcluster,paste('scmapcluster_',NumGenes,'_Training_Time.csv', sep = ''),row.names = FALSE)\r\n    write.csv(Testing_Time_scmapcluster,paste('scmapcluster_',NumGenes,'_Testing_Time.csv', sep = ''),row.names = FALSE)\r\n    write.csv(Training_Time_scmapcell,paste('scmapcell_',NumGenes,'_Training_Time.csv', sep = ''),row.names = FALSE)\r\n    write.csv(Testing_Time_scmapcell,paste('scmapcell_',NumGenes,'_Testing_Time.csv', sep = ''),row.names = FALSE)\r\n  }\r\n  else{\r\n    write.csv(True_Labels_scmapcluster,'scmapcluster_True_Labels.csv',row.names = FALSE)\r\n    write.csv(Pred_Labels_scmapcluster,'scmapcluster_Pred_Labels.csv',row.names = FALSE)\r\n    write.csv(True_Labels_scmapcell,'scmapcell_True_Labels.csv',row.names = FALSE)\r\n    write.csv(Pred_Labels_scmapcell,'scmapcell_Pred_Labels.csv',row.names = FALSE)\r\n    write.csv(Training_Time_scmapcluster,'scmapcluster_Training_Time.csv',row.names = FALSE)\r\n    write.csv(Testing_Time_scmapcluster,'scmapcluster_Testing_Time.csv',row.names = FALSE)\r\n    write.csv(Training_Time_scmapcell,'scmapcell_Training_Time.csv',row.names = FALSE)\r\n    write.csv(Testing_Time_scmapcell,'scmapcell_Testing_Time.csv',row.names = FALSE)\r\n  }\r\n}\r\n"
  },
  {
    "path": "Scripts/run_singleCellNet.R",
    "content": "run_singleCellNet<-function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){\r\n  \"\r\n  run singleCellNet\r\n  Wrapper script to run singleCellNet on a benchmark dataset with 5-fold cross validation,\r\n  outputs lists of true and predicted cell labels as csv files, as well as computation time.\r\n  \r\n  Parameters\r\n  ----------\r\n  DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes \r\n  as row names and gene names as column names.\r\n  LabelsPath : Cell population annotations file path (.csv).\r\n  CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.\r\n  OutputDir : Output directory defining the path of the exported file.\r\n  GeneOrderPath : Gene order file path (.csv) obtained from feature selection, \r\n  defining the genes order for each cross validation fold, default is NULL.\r\n  NumGenes : Number of genes used in case of feature selection (integer), default is NULL.\r\n  \"\r\n  \r\n  Data <- read.csv(DataPath,row.names = 1)\r\n  colnames(Data) <- gsub('_','.',colnames(Data), fixed = TRUE)\r\n  Labels <- as.matrix(read.csv(LabelsPath))\r\n  load(CV_RDataPath)\r\n  Labels <- as.vector(Labels[,col_Index])\r\n  Data <- Data[Cells_to_Keep,]\r\n  Labels <- Labels[Cells_to_Keep]\r\n  if(!is.null(GeneOrderPath) & !is.null (NumGenes)){\r\n    GenesOrder = read.csv(GeneOrderPath)\r\n  }\r\n  \r\n  #############################################################################\r\n  #                              singleCellNet                                #\r\n  #############################################################################\r\n  library(singleCellNet)\r\n  library(dplyr)\r\n  True_Labels_singleCellNet <- list()\r\n  Pred_Labels_singleCellNet <- list()\r\n  Training_Time_singleCellNet <- list()\r\n  Testing_Time_singleCellNet <- list()\r\n  Data = t(as.matrix(Data))              # deals also with sparse matrix\r\n  \r\n  
for(i in c(1:n_folds)){\r\n    if(!is.null(GeneOrderPath) & !is.null (NumGenes)){\r\n      DataTrain <- Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Train_Idx[[i]]]\r\n      DataTest <- Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Test_Idx[[i]]]\r\n    }\r\n    else{\r\n      DataTrain <- Data[,Train_Idx[[i]]]\r\n      DataTest <- Data[,Test_Idx[[i]]]\r\n    }\r\n    \r\n    start_time <- Sys.time()\r\n    cgenes2<-findClassyGenes(DataTrain, data.frame(Annotation = Labels[Train_Idx[[i]]]), \"Annotation\")\r\n    cgenesA<-cgenes2[['cgenes']]\r\n    grps<-cgenes2[['grps']]\r\n    DataTrain<-as.matrix(DataTrain[cgenesA,])\r\n    xpairs<-ptGetTop(DataTrain, grps, ncores = 1)\r\n    pdTrain<-query_transform(DataTrain[cgenesA, ], xpairs)\r\n    rf<-sc_makeClassifier(pdTrain[xpairs,], genes=xpairs, groups=grps)\r\n    end_time <- Sys.time()\r\n    Training_Time_singleCellNet[i] <- as.numeric(difftime(end_time,start_time,units = 'secs'))\r\n    \r\n    start_time <- Sys.time()\r\n    DataTest<-query_transform(DataTest[cgenesA,], xpairs)\r\n    classRes <-rf_classPredict(rf, DataTest)\r\n    end_time <- Sys.time()\r\n    Testing_Time_singleCellNet[i] <- as.numeric(difftime(end_time,start_time,units = 'secs'))\r\n    \r\n    True_Labels_singleCellNet[i] <- list(Labels[Test_Idx[[i]]])\r\n    Pred_Labels_singleCellNet[i] <- list((rownames(classRes)[apply(classRes,2,which.max)])[1:length(Test_Idx[[i]])])\r\n  }\r\n  True_Labels_singleCellNet <- as.vector(unlist(True_Labels_singleCellNet))\r\n  Pred_Labels_singleCellNet <- as.vector(unlist(Pred_Labels_singleCellNet))\r\n  Training_Time_singleCellNet <- as.vector(unlist(Training_Time_singleCellNet))\r\n  Testing_Time_singleCellNet <- as.vector(unlist(Testing_Time_singleCellNet))\r\n  \r\n  setwd(OutputDir)\r\n  \r\n  if(!is.null(GeneOrderPath) & !is.null (NumGenes)){\r\n    write.csv(True_Labels_singleCellNet,paste('singleCellNet_',NumGenes,'_True_Labels.csv', sep = ''),row.names = FALSE)\r\n    
write.csv(Pred_Labels_singleCellNet,paste('singleCellNet_',NumGenes,'_Pred_Labels.csv', sep = ''),row.names = FALSE)\r\n    write.csv(Training_Time_singleCellNet,paste('singleCellNet_',NumGenes,'_Training_Time.csv', sep = ''),row.names = FALSE)\r\n    write.csv(Testing_Time_singleCellNet,paste('singleCellNet_',NumGenes,'_Testing_Time.csv', sep = ''),row.names = FALSE)\r\n  }\r\n  else{\r\n    write.csv(True_Labels_singleCellNet,'singleCellNet_True_Labels.csv',row.names = FALSE)\r\n    write.csv(Pred_Labels_singleCellNet,'singleCellNet_Pred_Labels.csv',row.names = FALSE)\r\n    write.csv(Training_Time_singleCellNet,'singleCellNet_Training_Time.csv',row.names = FALSE)\r\n    write.csv(Testing_Time_singleCellNet,'singleCellNet_Testing_Time.csv',row.names = FALSE)\r\n  }\r\n}\r\n"
  },
  {
    "path": "Snakemake/Cross_Validation.R",
    "content": "args <- commandArgs(TRUE)\r\n\r\nCross_Validation <- function(LabelsPath, col_Index = 1, OutputDir){\r\n  \"\r\n  Cross_Validation\r\n  Function returns train and test indices for 5 folds stratified across unique cell populations,\r\n  also filter out cell populations with less than 10 cells.\r\n  It return a 'CV_folds.RData' file which then used as input to classifiers wrappers.\r\n\r\n  Parameters\r\n  ----------\r\n  LabelsPath : Cell population annotations file path (.csv).\r\n  col_Index : column index (integer) defining which level of annotation to use,\r\n  in case of multiple cell type annotations (default is 1)\r\n  OutputDir : Output directory defining the path of the exported file.\r\n  \"\r\n\r\n  Labels <- as.matrix(read.csv(LabelsPath))\r\n  Labels <- as.vector(Labels[,col_Index])\r\n\r\n  Removed_classes <- !(table(Labels) > 10)\r\n  Cells_to_Keep <- !(is.element(Labels,names(Removed_classes)[Removed_classes]))\r\n  Labels <- Labels[Cells_to_Keep]\r\n\r\n  # Getting training and testing Folds\r\n  library(rBayesianOptimization)\r\n  n_folds = 5\r\n  Folds <- KFold(Labels,nfolds = n_folds, stratified = TRUE)\r\n  Test_Folds <- c(n_folds:1)\r\n  Train_Idx <- list()\r\n  Test_Idx <- list()\r\n  for (i in c(1:length(Folds))){\r\n    Temp_Folds <- Folds\r\n    Temp_Folds[Test_Folds[i]] <- NULL\r\n    Train_Idx[i] <- list(unlist(Temp_Folds))\r\n    Test_Idx[i] <- Folds[Test_Folds[i]]\r\n  }\r\n  remove(Temp_Folds,i,Folds)\r\n  save(n_folds,Train_Idx,Test_Idx,col_Index,Cells_to_Keep,file = paste0(OutputDir, '/CV_folds.RData'))\r\n}\r\n\r\nCross_Validation(args[1], as.numeric(args[2]), args[3])\r\n"
  },
  {
    "path": "Snakemake/DEgenesMAST.R",
    "content": "DEgenesMAST <- function(Data, Labels, Normalize = FALSE, LogTransform = FALSE){\r\n  # This functions applies a differential expression test to the data using one vs all\r\n  # The training data should be used a an input\r\n  # The output is a matrix with marker genes where the columns are the cell populations and the rows are the top20 marker genes\r\n  # This output can be rewritten to the format of the prior-knowledge-supervised classifiers and afterwards be used to classify the test set.\r\n  \r\n  # Data: genes X cells (rows = genes, columns = cells)\r\n  # Labels: labels of the data\r\n  # Normalize: the input for MAST should be cpm normalized data, \r\n  #            if the data is not normalized yet, this should be set to TRUE\r\n  # LogTransform: the input for MAST should be logtransformed,\r\n  #            if the data is not logtransformed yet, this should be set to TRUE\r\n  \r\n  \r\n  library(Seurat)\r\n  \r\n  if(Normalize)\r\n  {\r\n    Data <- apply(Data, 2, function(x) (x/sum(x))*1000000)\r\n  }\r\n  \r\n  if(LogTransform)\r\n  {\r\n    Data <- log(Data+1, base = 2)\r\n  }\r\n  SeuObj <- CreateSeuratObject(raw.data = Data, project = \"DEgenes\")\r\n  SeuObj <- SetIdent(SeuObj, ident.use = Labels)\r\n  DEgenes <- FindAllMarkers(SeuObj, test.use = \"MAST\")\r\n  Markers <- matrix(nrow = 20,ncol = length(unique(Labels)))\r\n  colnames(Markers) <- unique(Labels)\r\n  for (i in unique(Labels)){\r\n    i\r\n    TempList <- DEgenes$gene[((DEgenes$cluster == i) & (DEgenes$avg_logFC > 0))]\r\n    MarkerGenes <- DEgenes$p_val_adj[DEgenes$cluster == i]\r\n    print(MarkerGenes[1:20])\r\n    if (length(TempList) >= 20){\r\n      Markers[,i] <- TempList[1:20]\r\n    }\r\n    else{\r\n      if(length(TempList) > 0){\r\n        Markers[c(1:length(TempList)),i] <- TempList\r\n      }\r\n    }\r\n  }\r\n  return(Markers)\r\n}\r\n"
  },
  {
    "path": "Snakemake/Dockerfiles/baseline/Dockerfile",
    "content": "FROM debian:9.9-slim\n\n# Install newest R version\nRUN apt-get update && \\\n    apt-get install --no-install-recommends --yes wget gnupg apt-transport-https && \\\n    wget -qO - http://keys.gnupg.net/pks/lookup?op=get\\&search=0xAD5F960A256A04AF | apt-key add - && \\\n    echo 'deb http://cloud.r-project.org/bin/linux/debian stretch-cran35/' >> /etc/apt/sources.list && \\\n    apt-get update && \\\n    apt-get install --no-install-recommends --yes r-base && \\\n    apt-get purge --yes wget gnupg apt-transport-https && \\\n    apt-get autoremove --yes && \\\n    apt-get clean && \\\n    rm -rf /var/lib/apt/lists/*\n\n# Install python\nRUN apt-get update && \\\n    apt-get install --no-install-recommends --yes python3 python3-pip && \\\n    pip3 --no-cache-dir install setuptools && \\\n    pip3 --no-cache-dir install pandas rpy2 scikit-learn statsmodels && \\\n    rm -rf /var/lib/apt/lists/*\n\nCOPY Scripts/run_kNN50.py \\\n     Scripts/run_kNN9.py \\\n     Scripts/run_LDA.py \\\n     Scripts/run_LDA_rejection.py \\\n     Scripts/run_NMC.py \\\n     Scripts/run_RF.py \\\n     Scripts/run_SVM.py \\\n     Scripts/run_SVM_rejection.py \\\n     rank_gene_dropouts.py \\\n     /Scripts/\n"
  },
  {
    "path": "Snakemake/Dockerfiles/cell_blast/Dockerfile",
    "content": "FROM python:3.7-slim-stretch\n\n# Install newest R version\nRUN apt-get update && \\\n    apt-get install --no-install-recommends --yes wget gnupg apt-transport-https && \\\n    wget -qO - http://keys.gnupg.net/pks/lookup?op=get\\&search=0xAD5F960A256A04AF | apt-key add - && \\\n    echo 'deb http://cloud.r-project.org/bin/linux/debian stretch-cran35/' >> /etc/apt/sources.list && \\\n    apt-get update && \\\n    apt-get install --no-install-recommends --yes r-base && \\\n    apt-get purge --yes wget gnupg apt-transport-https && \\\n    apt-get autoremove --yes && \\\n    apt-get clean && \\\n    rm -rf /var/lib/apt/lists/*\n\n# Install python and pip deps\nRUN apt-get update && apt-get install --no-install-recommends --yes build-essential python3-dev libxml2 libxml2-dev zlib1g-dev && \\\n    pip3 --no-cache-dir install --upgrade pip && \\\n    pip3 --no-cache-dir install --upgrade setuptools && \\\n    pip3 --no-cache-dir install pandas rpy2 scikit-learn statsmodels tensorflow Cell-BLAST && \\\n    apt-get autoremove --yes && \\\n    apt-get clean && \\\n    rm -rf /var/lib/apt/lists/*\n\nCOPY Scripts/run_Cell_BLAST.py /Scripts/\n"
  },
  {
    "path": "Snakemake/Dockerfiles/chetah/Dockerfile",
    "content": "FROM debian:9.9-slim\n\n# Install newest R version\nRUN apt-get update && \\\n    apt-get install --no-install-recommends --yes wget gnupg apt-transport-https && \\\n    wget -qO - http://keys.gnupg.net/pks/lookup?op=get\\&search=0xAD5F960A256A04AF | apt-key add - && \\\n    echo 'deb http://cloud.r-project.org/bin/linux/debian stretch-cran35/' >> /etc/apt/sources.list && \\\n    apt-get update && \\\n    apt-get install --no-install-recommends --yes r-base && \\\n    apt-get purge --yes wget gnupg apt-transport-https && \\\n    apt-get autoremove --yes && \\\n    apt-get clean && \\\n    rm -rf /var/lib/apt/lists/*\n\nCOPY Scripts/run_CHETAH.R \\\n     Dockerfiles/chetah/install_packages.R \\\n     /Scripts/\n\n# Install R packages\nRUN apt-get update && \\\n    apt-get install --no-install-recommends --yes make gcc g++ r-base-dev libcurl4-openssl-dev libssl-dev libxml2-dev && \\\n    Rscript --vanilla /Scripts/install_packages.R && \\\n    apt-get purge --yes make gcc g++ r-base-dev libcurl4-openssl-dev libssl-dev libxml2-dev && \\\n    apt-get autoremove --yes && \\\n    apt-get clean && \\\n    rm -rf /var/lib/apt/lists/*\n"
  },
  {
    "path": "Snakemake/Dockerfiles/chetah/install_packages.R",
    "content": "withCallingHandlers({\n  install.packages(\"devtools\", repos=\"https://cloud.r-project.org/\")\n  install.packages(\"BiocManager\", repos=\"https://cloud.r-project.org/\")\n  BiocManager::install(c(\"bioDist\", \"ggplot2\", \"gplots\", \"cowplot\",\n                         \"dendextend\", \"corrplot\", \"reshape2\", \"plotly\"))\n  devtools::install_github(\"jdekanter/CHETAH\", ref=\"b777e6f671bff3c434842adb655869a52bc9e368\")\n},\nwarning = function(w) stop(w))\n"
  },
  {
    "path": "Snakemake/Dockerfiles/cross_validation/Dockerfile",
    "content": "FROM debian:9.9-slim\n\n# Install newest R version\nRUN apt-get update && \\\n    apt-get install --no-install-recommends --yes wget gnupg apt-transport-https && \\\n    wget -qO - http://keys.gnupg.net/pks/lookup?op=get\\&search=0xAD5F960A256A04AF | apt-key add - && \\\n    echo 'deb http://cloud.r-project.org/bin/linux/debian stretch-cran35/' >> /etc/apt/sources.list && \\\n    apt-get update && \\\n    apt-get install --no-install-recommends --yes r-base && \\\n    apt-get purge --yes wget gnupg apt-transport-https && \\\n    apt-get autoremove --yes && \\\n    apt-get clean && \\\n    rm -rf /var/lib/apt/lists/*\n\nCOPY Cross_Validation.R \\\n     Dockerfiles/cross_validation/install_packages.R \\\n     /Scripts/\n\n# Install R packages\nRUN apt-get update && \\\n    apt-get install --no-install-recommends --yes make gcc libc6-dev g++ libxml2-dev && \\\n    Rscript --vanilla /Scripts/install_packages.R && \\\n    apt-get purge --yes make gcc g++ libxml2-dev && \\\n    apt-get autoremove --yes && \\\n    apt-get clean && \\\n    rm -rf /var/lib/apt/lists/*\n"
  },
  {
    "path": "Snakemake/Dockerfiles/cross_validation/install_packages.R",
    "content": "withCallingHandlers({\n  install.packages(\"lhs\", repos=\"https://cloud.r-project.org/\")\n  install.packages(\"rBayesianOptimization\", repos=\"https://cloud.r-project.org/\")\n},\nwarning = function(w) stop(w))\n"
  },
  {
    "path": "Snakemake/Dockerfiles/garnett/Dockerfile",
    "content": "FROM debian:9.9-slim\n\n# Install newest R version\nRUN apt-get update && \\\n    apt-get install --no-install-recommends --yes wget gnupg apt-transport-https && \\\n    wget -qO - http://keys.gnupg.net/pks/lookup?op=get\\&search=0xAD5F960A256A04AF | apt-key add - && \\\n    echo 'deb http://cloud.r-project.org/bin/linux/debian stretch-cran35/' >> /etc/apt/sources.list && \\\n    apt-get update && \\\n    apt-get install --no-install-recommends --yes r-base && \\\n    apt-get purge --yes wget gnupg apt-transport-https && \\\n    apt-get autoremove --yes && \\\n    apt-get clean && \\\n    rm -rf /var/lib/apt/lists/*\n\nCOPY Scripts/run_Garnett_CV.R \\\n     Scripts/run_Garnett_Pretrained.R \\\n     Dockerfiles/garnett/install_packages.R \\\n     /Scripts/\n\n# Install R packages\nRUN apt-get update && \\\n    apt-get install --no-install-recommends --yes make gcc g++ libxml2-dev zlib1g-dev gfortran liblapack-dev libcurl4-gnutls-dev libssl-dev && \\\n    Rscript --vanilla /Scripts/install_packages.R && \\\n    apt-get purge --yes make gcc g++ zlib1g-dev gfortran liblapack-dev libcurl4-gnutls-dev libssl-dev && \\\n    apt-get autoremove --yes && \\\n    apt-get clean && \\\n    rm -rf /var/lib/apt/lists/*\n"
  },
  {
    "path": "Snakemake/Dockerfiles/garnett/install_packages.R",
    "content": "withCallingHandlers({\n  install.packages(\"BiocManager\", repos=\"https://cloud.r-project.org/\")\n  BiocManager::install(c(\"monocle\", \"DelayedArray\", \"DelayedMatrixStats\",\n                       \"org.Hs.eg.db\", \"org.Mm.eg.db\"))\n  install.packages(\"devtools\", repos=\"https://cloud.r-project.org/\")\n  devtools::install_github(\"cole-trapnell-lab/garnett\", ref=\"9804b532bbcc1714b3ed0b718cf430741f1dba6c\")\n},\nwarning = function(w) stop(w))\n"
  },
  {
    "path": "Snakemake/Dockerfiles/scid/Dockerfile",
    "content": "FROM r-base:3.6.0\n\nCOPY Scripts/run_scID.R \\\n     Dockerfiles/scid/install_packages.R \\\n     /Scripts/\n\n# Install R packages\nRUN apt-get update && \\\n    apt-get install --no-install-recommends --yes make gcc g++ r-base-dev libcurl4-openssl-dev libssl-dev libxml2-dev && \\\n    Rscript --vanilla /Scripts/install_packages.R && \\\n    apt-get purge --yes make gcc g++ r-base-dev libcurl4-openssl-dev libssl-dev libxml2-dev && \\\n    apt-get autoremove --yes && \\\n    apt-get clean && \\\n    rm -rf /var/lib/apt/lists/*\n"
  },
  {
    "path": "Snakemake/Dockerfiles/scid/install_packages.R",
    "content": "withCallingHandlers({\n  install.packages(\"BiocManager\", repos=\"https://cloud.r-project.org/\")\n  BiocManager::install(ask = FALSE);\n  BiocManager::install(c(\"scater\", \"MAST\"))\n  install.packages(\"devtools\", repos=\"https://cloud.r-project.org/\")\n  devtools::install_github(\"satijalab/seurat\")\n  devtools::install_github(\"BatadaLab/scID\")\n},\nwarning = function(w) stop(w))\n"
  },
  {
    "path": "Snakemake/Dockerfiles/scmap/Dockerfile",
    "content": "FROM r-base:3.6.0\n\nCOPY Scripts/run_scmapcell.R \\\n     Scripts/run_scmapcluster.R \\\n     Dockerfiles/scmap/install_packages.R \\\n     /Scripts/\n\n# Install R packages\nRUN apt-get update && \\\n    apt-get install --no-install-recommends --yes make gcc g++ r-base-dev libcurl4-openssl-dev libssl-dev libxml2-dev && \\\n    Rscript --vanilla /Scripts/install_packages.R && \\\n    apt-get purge --yes make gcc g++ r-base-dev libcurl4-openssl-dev libssl-dev libxml2-dev && \\\n    apt-get autoremove --yes && \\\n    apt-get clean && \\\n    rm -rf /var/lib/apt/lists/*\n"
  },
  {
    "path": "Snakemake/Dockerfiles/scmap/install_packages.R",
    "content": "withCallingHandlers({\n  install.packages(\"BiocManager\", repos=\"https://cloud.r-project.org/\")\n  BiocManager::install(ask = FALSE)\n  BiocManager::install(\"SingleCellExperiment\")\n  install.packages(\"devtools\", repos=\"https://cloud.r-project.org/\")\n  devtools::install_github(\"hemberg-lab/scmap\")\n},\nwarning = function(w) stop(w))\n"
  },
  {
    "path": "Snakemake/Dockerfiles/scvi/Dockerfile",
    "content": "FROM python:3.7-slim-stretch\n\n# Install newest R version\nRUN apt-get update && \\\n    apt-get install --no-install-recommends --yes wget gnupg apt-transport-https && \\\n    wget -qO - http://keys.gnupg.net/pks/lookup?op=get\\&search=0xAD5F960A256A04AF | apt-key add - && \\\n    echo 'deb http://cloud.r-project.org/bin/linux/debian stretch-cran35/' >> /etc/apt/sources.list && \\\n    apt-get update && \\\n    apt-get install --no-install-recommends --yes r-base && \\\n    apt-get purge --yes wget gnupg apt-transport-https && \\\n    apt-get autoremove --yes && \\\n    apt-get clean && \\\n    rm -rf /var/lib/apt/lists/*\n\n# Install python and pip deps\nRUN apt-get update && apt-get install --no-install-recommends --yes build-essential python3-dev libxml2 libxml2-dev zlib1g-dev && \\\n    pip3 --no-cache-dir install --upgrade pip && \\\n    pip3 --no-cache-dir install --upgrade setuptools && \\\n    pip3 --no-cache-dir install pandas rpy2 scikit-learn statsmodels tensorflow scvi && \\\n    apt-get autoremove --yes && \\\n    apt-get clean && \\\n    rm -rf /var/lib/apt/lists/*\n\n\nCOPY Scripts/run_scVI.py /Scripts/\n"
  },
  {
    "path": "Snakemake/Dockerfiles/singlecellnet/Dockerfile",
    "content": "FROM debian:9.9-slim\n\n# Install newest R version\nRUN apt-get update && \\\n    apt-get install --no-install-recommends --yes wget gnupg apt-transport-https && \\\n    wget -qO - http://keys.gnupg.net/pks/lookup?op=get\\&search=0xAD5F960A256A04AF | apt-key add - && \\\n    echo 'deb http://cloud.r-project.org/bin/linux/debian stretch-cran35/' >> /etc/apt/sources.list && \\\n    apt-get update && \\\n    apt-get install --no-install-recommends --yes r-base && \\\n    apt-get purge --yes wget gnupg apt-transport-https && \\\n    apt-get autoremove --yes && \\\n    apt-get clean && \\\n    rm -rf /var/lib/apt/lists/*\n\nCOPY Scripts/run_singleCellNet.R \\\n     Dockerfiles/singlecellnet/install_packages.R \\\n     /Scripts/\n\n# Install R packages\nRUN apt-get update && \\\n    apt-get install --no-install-recommends --yes make gcc libc6-dev g++ libcurl4-openssl-dev zlib1g-dev libssl-dev r-base-dev libxml2-dev && \\\n    Rscript --vanilla /Scripts/install_packages.R && \\\n    apt-get purge --yes make gcc g++ zlib1g-dev libcurl4-openssl-dev libc6-dev libssl-dev r-base-dev libxml2-dev && \\\n    apt-get autoremove --yes && \\\n    apt-get clean && \\\n    rm -rf /var/lib/apt/lists/*\n"
  },
  {
    "path": "Snakemake/Dockerfiles/singlecellnet/install_packages.R",
    "content": "withCallingHandlers({\n  install.packages(\"devtools\", repos=\"https://cloud.r-project.org/\")\n  install.packages(\"BiocManager\", repos=\"https://cloud.r-project.org/\")\n  BiocManager::install(\"fgsea\")\n  devtools::install_github(\"thomasp85/patchwork\", ref=\"fd7958bae3e7a1e30237c751952e412a0a1d1242\")\n  devtools::install_github(\"pcahan1/singleCellNet\", ref=\"4279a68112743b783cc82628421dd703261ec117\")\n},\nwarning = function(w) stop(w))\n"
  },
  {
    "path": "Snakemake/Dockerfiles/singler/Dockerfile",
    "content": "FROM debian:9.9-slim\n\n# Install newest R version\nRUN apt-get update && \\\n    apt-get install --no-install-recommends --yes wget gnupg apt-transport-https && \\\n    wget -qO - http://keys.gnupg.net/pks/lookup?op=get\\&search=0xAD5F960A256A04AF | apt-key add - && \\\n    echo 'deb http://cloud.r-project.org/bin/linux/debian stretch-cran35/' >> /etc/apt/sources.list && \\\n    apt-get update && \\\n    apt-get install --no-install-recommends --yes r-base && \\\n    apt-get purge --yes wget gnupg apt-transport-https && \\\n    apt-get autoremove --yes && \\\n    apt-get clean && \\\n    rm -rf /var/lib/apt/lists/*\n\nCOPY Scripts/run_SingleR.R \\\n     Dockerfiles/singler/install_packages.R \\\n     /Scripts/\n\nRUN apt-get update && \\\n    apt-get install --no-install-recommends --yes make gcc g++ r-base-dev libcurl4-openssl-dev libssl-dev libxml2-dev libxml2 && \\\n    Rscript --vanilla /Scripts/install_packages.R && \\\n    apt-get purge --yes make gcc g++ r-base-dev libcurl4-openssl-dev libssl-dev libxml2-dev && \\\n    apt-get autoremove --yes && \\\n    apt-get clean && \\\n    rm -rf /var/lib/apt/lists/*\n"
  },
  {
    "path": "Snakemake/Dockerfiles/singler/install_packages.R",
    "content": "withCallingHandlers({\n  install.packages(\"devtools\", repos=\"https://cloud.r-project.org/\")\n  install.packages(\"Seurat\", repos=\"https://cloud.r-project.org/\")\n  devtools::install_github(\"dviraran/SingleR\", ref=\"db4823b380ba2c3142c857c8c0695200dd1736f6\")\n},\nwarning = function(w) stop(w))\n"
  },
  {
    "path": "Snakemake/LICENSE",
    "content": "MIT License\n\nCopyright (c) 2019 tabdelaal\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n"
  },
  {
    "path": "Snakemake/README.md",
    "content": "# scRNAseq_Benchmark\nBenchmarking classification tools for scRNA-seq data\n\n## How to use\n[snakemake](https://snakemake.readthedocs.io/en/stable/index.html) and\n[singularity](https://www.sylabs.io/docs/) need to be available on your \nsystem. You will need to run this on a linux system, as singularity\nonly supports linux.\n\nFrom the root of this repository:\n```\nsnakemake \\\n  --configfile <configfile> \\\n  --use-singularity\n```\n\nIf your data or output directory is not located under the root of this\nrepository, be sure to tell snakemake to mount the appropriate directories\nin singularity:\n```\nsnakemake \\\n  --configfile <configfile> \\\n  --use-singularity \\\n  --singularity-args '--bind <location of inputs>:<location of inputs> --bind <output directory>:<output directory>'\n```\n\n#### The config file\n```YML\noutput_dir: <path to outputs directory>\ndatafile: <path to csv file with counts per cell>\nlabfile: <csv with true labels per cell>\ncolumn: <The index of the column in the labels file which ought to be used, defaults to 1>\nnumber_of_features: <number of features to be used as input for the classification methods, 0 means all, defaults to 0>\ngenes: <path to gene name list, only needed for garnett_CV and Garnett_Pretrained>\nhuman: <whether or not the data is human, true means human, false means mouse, defaults to true>\ntools_to_run: # List of tools to run\n  - <tool 1>\n  - <tool 2>\n  - <...>\n```\n\n##### Tool specific inputs\nSome tools require specific inputs. 
Add the following to your config file when using\none of these tools:\n- Garnett_CV\n  ```YML\n  Garnett_CV:\n    markers: <path to Garnett marker gene file>\n  ```\n- Garnett_Pretrained\n  ```YML\n  Garnett_Pretrained:\n    classifier: <path to Garnett classifier>\n  ```\n\n<!-- TODO explain these input files -->\n\n## Included tools/methods\n- kNN50\n- kNN9\n- LDA\n- LDA_rejection (LDA with rejection option)\n- NMC\n- RF\n- SVM\n- SVM_rejection (SVM with rejection option)\n- [singleCellNet](https://github.com/pcahan1/singleCellNet)\n- [CHETAH](https://github.com/jdekanter/CHETAH)\n- [scmap](https://github.com/hemberg-lab/scmap)\n  - scmapcell\n  - scmapcluster\n- [SingleR](https://github.com/dviraran/SingleR)\n- [scID](https://github.com/BatadaLab/scID)\n- [scVI](https://github.com/YosefLab/scVI)\n- [Cell_BLAST](https://github.com/gao-lab/Cell_BLAST)\n- [Garnett](https://cole-trapnell-lab.github.io/garnett/)\n  - Garnett_CV (without pretrained classifier)\n  - Garnett_Pretrained (with pretrained classifier)\n\n## Adding new tools\nIn order to add a tool to this benchmarking workflow, a rule for this tool\nneeds to be added to the `Snakefile`. This rule should produce as output:\n- a table of predicted labels (`<output directory>/<tool>/<tool>_pred.csv`).\n- a table of true labels (`<output directory>/<tool>/<tool>_true.csv`).\n- tables of testing, prediction and/or total time:\n  - `<output directory>/<tool>/<tool>_test_time.csv`\n  - `<output directory>/<tool>/<tool>_training_time.csv`\n  - `<output directory>/<tool>/<tool>_total_time.csv`\n\nThe input to this rule should be:\n- a count table (specified as the `datafile` in the config).\n- a true labels file (specified as the `labfile` in the config).\n\nYou will want to write a wrapper script for the tool you want to\nadd to facilitate this. 
The `\"{output_dir}/CV_folds.RData\"` input may be\nused to provide your wrapper script with folds for cross_validation.\nIt is recommended to make a docker image containing all dependencies for both\nthe tool and any wrappers for the tool.  \nThis wrapper script should also make a selection of the features to be used.\nThis selection should be based on ranking which can be accessed by providing\n`feature ranking` as input to the wrapper script. The number of features to be\nused should be configurable and settable through the 'number_of_features' field\nin the config.\n\nThe following can be used as a template for new rules. Replace everything\nsurrounded by (and including the) `<>` with appropriate values.\n```\nrule SVM:\n  input:\n    datafile = config[\"datafile\"],\n    labfile = config[\"labfile\"],\n    folds = \"{output_dir}/CV_folds.RData\",\n    ranking = feature_ranking\n  output:\n    pred = \"{output_dir}/<tool name>/<tool name>_pred.csv\",\n    true = \"{output_dir}/<tool name>/<tool name>_true.csv\",\n    test_time = \"{output_dir}/<tool name>/<tool name>_test_time.csv\",\n    training_time = \"{output_dir}/<tool name>/<tool name>_training_time.csv\"\n  log: \"{output_dir}/<tool name>/<tool name>.log\"\n  params:\n    n_features = config.get(\"number_of_features\", 0)\n  singularity: \"docker://<docker image>\"\n  shell:\n    \"<python or Rscript> <wrapper script> \"\n    \"{input.datafile} \"\n    \"{input.labfile} \"\n    \"{input.folds} \"\n    \"{wildcards.output_dir}/<tool name> \"\n    \"{input.ranking} \"\n    \"{params.n_features} \"\n    \"&> {log}\"\n```\n"
  },
  {
    "path": "Snakemake/Scripts/run_ACTINN.py",
    "content": "import os \r\nimport numpy as np\r\nimport pandas as pd\r\nimport time as tm\r\nimport rpy2.robjects as robjects\r\n\r\ndef run_ACTINN(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = \"\", NumGenes = 0):\r\n    '''\r\n    run ACTINN\r\n    Wrapper script to run ACTINN on a benchmark dataset with 5-fold cross validation,\r\n    outputs lists of true and predicted cell labels as csv files, as well as computation time.\r\n  \r\n    Parameters\r\n    ----------\r\n    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes \r\n    as row names and gene names as column names.\r\n    LabelsPath : Cell population annotations file path (.csv).\r\n    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.\r\n    OutputDir : Output directory defining the path of the exported file.\r\n    GeneOrderPath : Gene order file path (.csv) obtained from feature selection, \r\n    defining the genes order for each cross validation fold, default is NULL.\r\n    NumGenes : Number of genes used in case of feature selection (integer), default is 0.\r\n    '''\r\n    \r\n    # read the Rdata file\r\n    robjects.r['load'](CV_RDataPath)\r\n\r\n    nfolds = np.array(robjects.r['n_folds'], dtype = 'int')\r\n    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')\r\n    col = np.array(robjects.r['col_Index'], dtype = 'int')\r\n    col = col - 1 \r\n    test_ind = np.array(robjects.r['Test_Idx'])\r\n    train_ind = np.array(robjects.r['Train_Idx'])\r\n\r\n    # read the data\r\n    data = pd.read_csv(DataPath,index_col=0,sep=',')\r\n    labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col)\r\n    \r\n    labels = labels.iloc[tokeep]\r\n    data = data.iloc[tokeep]\r\n    \r\n    # read the feature file\r\n    if (NumGenes > 0):\r\n        features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')\r\n    \r\n    # folder with results\r\n    
os.chdir(OutputDir)\r\n    \r\n    tot=[]\r\n    truelab = []\r\n    pred = []\r\n\r\n    for i in range(np.squeeze(nfolds)):\r\n        test_ind_i = np.array(test_ind[i], dtype = 'int') - 1\r\n        train_ind_i = np.array(train_ind[i], dtype = 'int') - 1\r\n    \r\n        train=data.iloc[train_ind_i]\r\n        test=data.iloc[test_ind_i]\r\n        y_train=labels.iloc[train_ind_i]\r\n        y_test=labels.iloc[test_ind_i]\r\n        \r\n        if (NumGenes > 0):\r\n            feat_to_use = features.iloc[0:NumGenes,i]\r\n            train = train.iloc[:,feat_to_use]\r\n            test = test.iloc[:,feat_to_use]\r\n        \r\n        train = train.transpose()\r\n        test = test.transpose()\r\n        \r\n        train.to_csv(\"train.csv\")\r\n        test.to_csv(\"test.csv\")\r\n        y_train.to_csv(\"train_lab.csv\", header = False, index = True, sep = '\\t')\r\n        y_test.to_csv(\"test_lab.csv\", header = False, index = True, sep = '\\t')\r\n        \r\n        tm.sleep(60)\r\n            \r\n        os.system(\"python /home/nfs/lcmmichielsen/classifiers/ACTINN/actinn_format.py -i train.csv -o train -f csv\")\r\n        os.system(\"python /home/nfs/lcmmichielsen/classifiers/ACTINN/actinn_format.py -i test.csv -o test -f csv\")\r\n        \r\n        start = tm.time()\r\n        os.system(\"python /home/nfs/lcmmichielsen/classifiers/ACTINN/actinn_predict.py -trs train.h5 -trl train_lab.csv -ts test.h5\")    \r\n        tot.append(tm.time()-start)\r\n        \r\n        tm.sleep(60)\r\n\r\n        truelab.extend(y_test.values)\r\n        predlabels = pd.read_csv('predicted_label.txt',header=0,index_col=None, sep='\\t', usecols = [1])            \r\n        pred.extend(predlabels.values)\r\n    \r\n            \r\n    truelab = pd.DataFrame(truelab)\r\n    pred = pd.DataFrame(pred)\r\n    tot_time = pd.DataFrame(tot)\r\n    \r\n    if (NumGenes == 0):  \r\n        truelab.to_csv(\"ACTINN_True_Labels.csv\", index = False)\r\n        
pred.to_csv(\"ACTINN_Pred_Labels.csv\", index = False)\r\n        tot_time.to_csv(\"ACTINN_Total_Time.csv\", index = False)\r\n    else:\r\n        truelab.to_csv(\"ACTINN_\" + str(NumGenes) + \"_True_Labels.csv\", index = False)\r\n        pred.to_csv(\"ACTINN_\" + str(NumGenes) + \"_Pred_Labels.csv\", index = False)\r\n        tot_time.to_csv(\"ACTINN_\" + str(NumGenes) + \"_Total_Time.csv\", index = False)\r\n        \r\n        \r\n        \r\n        \r\n        \r\n        \r\n        \r\n"
  },
  {
    "path": "Snakemake/Scripts/run_CHETAH.R",
    "content": "args <- commandArgs(TRUE)\r\n\r\nrun_CHETAH<-function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){\r\n  \"\r\n  run CHETAH\r\n  Wrapper script to run CHETAH on a benchmark dataset with 5-fold cross validation,\r\n  outputs lists of true and predicted cell labels as csv files, as well as computation time.\r\n  \r\n  Parameters\r\n  ----------\r\n  DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes \r\n  as row names and gene names as column names.\r\n  LabelsPath : Cell population annotations file path (.csv).\r\n  CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.\r\n  OutputDir : Output directory defining the path of the exported file.\r\n  GeneOrderPath : Gene order file path (.csv) obtained from feature selection, \r\n  defining the genes order for each cross validation fold, default is NULL.\r\n  NumGenes : Number of genes used in case of feature selection (integer), default is NULL.\r\n  \"\r\n  \r\n  Data <- read.csv(DataPath,row.names = 1)\r\n  Labels <- as.matrix(read.csv(LabelsPath))\r\n  load(CV_RDataPath)\r\n  Labels <- as.vector(Labels[,col_Index])\r\n  Data <- Data[Cells_to_Keep,]\r\n  Labels <- Labels[Cells_to_Keep]\r\n  if(!is.null(GeneOrderPath) & !is.null (NumGenes)){\r\n    GenesOrder = read.csv(GeneOrderPath)\r\n  }\r\n  \r\n  #############################################################################\r\n  #                                CHETAH                                     #\r\n  #############################################################################\r\n  library(CHETAH)\r\n  library(SingleCellExperiment)\r\n  True_Labels_CHETAH <- list()\r\n  Pred_Labels_CHETAH <- list()\r\n  Total_Time_CHETAH <- list()\r\n  Data = t(as.matrix(Data))\r\n  \r\n  for (i in c(1:n_folds)){\r\n    if(!is.null(GeneOrderPath) & !is.null (NumGenes)){\r\n      sce <- SingleCellExperiment(assays = list(counts = 
Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Train_Idx[[i]]]), \r\n                                  colData = data.frame(celltypes = Labels[Train_Idx[[i]]]))\r\n      \r\n      sce_test <- SingleCellExperiment(assays = list(counts = Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Test_Idx[[i]]]), \r\n                                       colData = data.frame(celltypes = Labels[Test_Idx[[i]]]))\r\n      start_time <- Sys.time()\r\n      sce_test <- CHETAHclassifier(input = sce_test, ref_cells = sce, n_genes = NumGenes)\r\n      end_time <- Sys.time()\r\n    }\r\n    else{\r\n      sce <- SingleCellExperiment(assays = list(counts = Data[,Train_Idx[[i]]]), \r\n                                  colData = data.frame(celltypes = Labels[Train_Idx[[i]]]))\r\n      \r\n      sce_test <- SingleCellExperiment(assays = list(counts = Data[,Test_Idx[[i]]]), \r\n                                       colData = data.frame(celltypes = Labels[Test_Idx[[i]]]))\r\n      start_time <- Sys.time()\r\n      sce_test <- CHETAHclassifier(input = sce_test, ref_cells = sce)\r\n      end_time <- Sys.time()\r\n    }\r\n    \r\n    Total_Time_CHETAH[i] <- as.numeric(difftime(end_time,start_time,units = 'secs'))\r\n    \r\n    True_Labels_CHETAH[i] <- list(Labels[Test_Idx[[i]]])\r\n    Pred_Labels_CHETAH[i] <- list(sce_test$celltype_CHETAH)\r\n  }\r\n  True_Labels_CHETAH <- as.vector(unlist(True_Labels_CHETAH))\r\n  Pred_Labels_CHETAH <- as.vector(unlist(Pred_Labels_CHETAH))\r\n  Total_Time_CHETAH <- as.vector(unlist(Total_Time_CHETAH))\r\n  write.csv(True_Labels_CHETAH,paste0(OutputDir,'/CHETAH_true.csv'),row.names = FALSE)\r\n  write.csv(Pred_Labels_CHETAH,paste0(OutputDir,'/CHETAH_pred.csv'),row.names = FALSE)\r\n  write.csv(Total_Time_CHETAH,paste0(OutputDir,'/CHETAH_total_time.csv'),row.names = FALSE)\r\n}\r\n\r\nif (args[6] == \"0\") {\r\n  run_CHETAH(args[1], args[2], args[3], args[4])\r\n} else {\r\n  run_CHETAH(args[1], args[2], args[3], args[4], args[5], as.numeric(args[6]))\r\n}\r\n"
  },
  {
    "path": "Snakemake/Scripts/run_CaSTLe.R",
    "content": "run_CaSTLe<-function(DataPath,LabelsPath,CV_RDataPath, OutputDir, GeneOrderPath = NULL, NumGenes = NULL){\r\n  \"\r\n  run CaSTLe\r\n  Wrapper script to run CaSTLe on a benchmark dataset with 5-fold cross validation,\r\n  outputs lists of true and predicted cell labels as csv files, as well as computation time.\r\n  \r\n  Parameters\r\n  ----------\r\n  DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes \r\n  as row names and gene names as column names.\r\n  LabelsPath : Cell population annotations file path (.csv).\r\n  CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.\r\n  OutputDir : Output directory defining the path of the exported file.\r\n  GeneOrderPath : Gene order file path (.csv) obtained from feature selection, \r\n  defining the genes order for each cross validation fold, default is NULL.\r\n  NumGenes : Number of genes used in case of feature selection (integer), default is NULL.\r\n  \"\r\n  \r\n  Data <- read.csv(DataPath,row.names = 1)\r\n  Labels <- as.matrix(read.csv(LabelsPath))\r\n  load(CV_RDataPath)\r\n  Labels <- as.vector(Labels[,col_Index])\r\n  Data <- Data[Cells_to_Keep,]\r\n  Labels <- Labels[Cells_to_Keep]\r\n  if(!is.null(GeneOrderPath) & !is.null (NumGenes)){\r\n    GenesOrder = read.csv(GeneOrderPath)\r\n  }\r\n  \r\n  #############################################################################\r\n  #                                CaSTLe                                     #\r\n  #############################################################################\r\n  library(igraph)\r\n  library(xgboost)\r\n  True_Labels_Castle <- list()\r\n  Pred_Labels_Castle <- list()\r\n  Training_Time_Castle <- list()\r\n  Testing_Time_Castle <- list()\r\n  \r\n  BREAKS=c(-1, 0, 1, 6, Inf)\r\n  nFeatures = 100\r\n  \r\n  for(i in c(1:n_folds)){\r\n    # 1. 
Load datasets\r\n    if(!is.null(GeneOrderPath) & !is.null (NumGenes)){\r\n      ds1 = Data[Train_Idx[[i]],as.vector(GenesOrder[c(1:NumGenes),i])+1]\r\n      ds2 = Data[Test_Idx[[i]],as.vector(GenesOrder[c(1:NumGenes),i])+1]\r\n    }\r\n    else{\r\n      ds1 = Data[Train_Idx[[i]],]\r\n      ds2 = Data[Test_Idx[[i]],]\r\n    }\r\n    \r\n    sourceCellTypes = as.factor(Labels[Train_Idx[[i]]])\r\n    targetCellTypes = as.factor(Labels[Test_Idx[[i]]])\r\n    \r\n    start_time <- Sys.time()\r\n    # 2. Unify sets, excluding low expressed genes\r\n    source_n_cells_counts = apply(ds1, 2, function(x) { sum(x > 0) } )\r\n    target_n_cells_counts = apply(ds2, 2, function(x) { sum(x > 0) } )\r\n    common_genes = intersect( colnames(ds1)[source_n_cells_counts>10], \r\n                              colnames(ds2)[target_n_cells_counts>10])\r\n    remove(source_n_cells_counts, target_n_cells_counts)\r\n    ds1 = ds1[, colnames(ds1) %in% common_genes]\r\n    ds2 = ds2[, colnames(ds2) %in% common_genes]\r\n    ds = rbind(ds1[,common_genes], ds2[,common_genes])\r\n    isSource = c(rep(TRUE,nrow(ds1)), rep(FALSE,nrow(ds2)))\r\n    remove(ds1, ds2)\r\n    \r\n    # 3. Highest mean in both source and target\r\n    topFeaturesAvg = colnames(ds)[order(apply(ds, 2, mean), decreasing = T)]\r\n    end_time <- Sys.time()\r\n    Training_Time_Castle[i] <- as.numeric(difftime(end_time,start_time,units = 'secs'))\r\n    \r\n    start_time <- Sys.time()\r\n    # for each cell - what is the most probable classification?\r\n    L = length(levels(sourceCellTypes))\r\n    targetClassification = as.data.frame(matrix(rep(0,L*sum(!isSource)), nrow=L), row.names = levels(sourceCellTypes))\r\n    \r\n    for (cellType in levels(sourceCellTypes)) {\r\n      \r\n      inSourceCellType = as.factor(ifelse(sourceCellTypes == cellType, cellType, paste0(\"NOT\",cellType)))\r\n      \r\n      # 4. 
Highest mutual information in source\r\n      topFeaturesMi = names(sort(apply(ds[isSource,],2,function(x) { compare(cut(x,breaks=BREAKS),inSourceCellType,method = \"nmi\") }), decreasing = T))\r\n      \r\n      # 5. Top n genes that appear in both mi and avg\r\n      selectedFeatures = union(head(topFeaturesAvg, nFeatures) , head(topFeaturesMi, nFeatures) )\r\n      \r\n      # 6. remove correlated features\r\n      tmp = cor(ds[,selectedFeatures], method = \"pearson\")\r\n      tmp[!lower.tri(tmp)] = 0\r\n      selectedFeatures = selectedFeatures[apply(tmp,2,function(x) any(x < 0.9))]\r\n      remove(tmp)\r\n      \r\n      # 7,8. Convert data from continous to binned dummy vars\r\n      # break datasets to bins\r\n      dsBins = apply(ds[, selectedFeatures], 2, cut, breaks= BREAKS)\r\n      # use only bins with more than one value\r\n      nUniq = apply(dsBins, 2, function(x) { length(unique(x)) })\r\n      # convert to dummy vars\r\n      ds0 = model.matrix(~ . , as.data.frame(dsBins[,nUniq>1]))\r\n      remove(dsBins, nUniq)\r\n      \r\n      cat(paste0(\"<h2>Classifier for \",cellType,\"</h2>\"))\r\n      \r\n      inTypeSource = sourceCellTypes == cellType\r\n      # 9. Classify\r\n      xg=xgboost(data=ds0[isSource,] , \r\n                 label=inTypeSource,\r\n                 objective=\"binary:logistic\", \r\n                 eta=0.7 , nthread=1, nround=20, verbose=0,\r\n                 gamma=0.001, max_depth=5, min_child_weight=10)\r\n      \r\n      # 10. 
Predict\r\n      inTypeProb = predict(xg, ds0[!isSource, ])\r\n      \r\n      targetClassification[cellType,] = inTypeProb\r\n    }\r\n    end_time <- Sys.time()\r\n    Testing_Time_Castle[i] <- as.numeric(difftime(end_time,start_time,units = 'secs'))\r\n    \r\n    True_Labels_Castle[i] <- list(Labels[Test_Idx[[i]]])\r\n    Pred_Labels_Castle[i] <- list(rownames(targetClassification)[apply(targetClassification,2,which.max)])\r\n  }\r\n  True_Labels_Castle <- as.vector(unlist(True_Labels_Castle))\r\n  Pred_Labels_Castle <- as.vector(unlist(Pred_Labels_Castle))\r\n  Training_Time_Castle <- as.vector(unlist(Training_Time_Castle))\r\n  Testing_Time_Castle <- as.vector(unlist(Testing_Time_Castle))\r\n  \r\n  if(!is.null(GeneOrderPath) & !is.null (NumGenes)){\r\n    write.csv(True_Labels_Castle,paste('True_Labels_Castle_',NumGenes,'.csv', sep = ''),row.names = FALSE)\r\n    write.csv(Pred_Labels_Castle,paste('Pred_Labels_Castle_',NumGenes,'.csv', sep = ''),row.names = FALSE)\r\n    write.csv(Training_Time_Castle,paste('Training_Time_Castle_',NumGenes,'.csv', sep = ''),row.names = FALSE)\r\n    write.csv(Testing_Time_Castle,paste('Testing_Time_Castle_',NumGenes,'.csv', sep = ''),row.names = FALSE)\r\n  }\r\n  else{\r\n    write.csv(True_Labels_Castle,'True_Labels_CaSTLe.csv',row.names = FALSE)\r\n    write.csv(Pred_Labels_Castle,'Pred_Labels_CaSTLe.csv',row.names = FALSE)\r\n    write.csv(Training_Time_Castle,'Training_Time_CaSTLe.csv',row.names = FALSE)\r\n    write.csv(Testing_Time_Castle,'Testing_Time_CaSTLe.csv',row.names = FALSE)\r\n  }\r\n  \r\n}"
  },
  {
    "path": "Snakemake/Scripts/run_Cell_BLAST.py",
    "content": "import os\r\nfrom sys import argv\r\nfrom pathlib import Path\r\nimport time as tm\r\nimport pandas as pd\r\nimport warnings\r\nwarnings.filterwarnings(\"ignore\")\r\n\r\nimport tensorflow as tf\r\ntf.logging.set_verbosity(0)\r\n\r\nimport Cell_BLAST as cb\r\nimport numpy as np\r\nfrom numpy import genfromtxt as gft\r\nimport rpy2.robjects as robjects\r\n\r\n\r\ndef run_Cell_BLAST(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = \"\", NumGenes = 0):\r\n    '''\r\n    run Cell_BLAST\r\n    Wrapper script to run Cell_BLAST on a benchmark dataset with 5-fold cross validation,\r\n    outputs lists of true and predicted cell labels as csv files, as well as computation time.\r\n  \r\n    Parameters\r\n    ----------\r\n    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes \r\n    as row names and gene names as column names.\r\n    LabelsPath : Cell population annotations file path (.csv).\r\n    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.\r\n    OutputDir : Output directory defining the path of the exported file.\r\n    GeneOrderPath : Gene order file path (.csv) obtained from feature selection, \r\n    defining the genes order for each cross validation fold, default is NULL.\r\n    NumGenes : Number of genes used in case of feature selection (integer), default is 0.\r\n    '''\r\n        \r\n    # read the Rdata file\r\n    robjects.r['load'](CV_RDataPath)\r\n\r\n    nfolds = np.array(robjects.r['n_folds'], dtype = 'int')\r\n    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')\r\n    col = np.array(robjects.r['col_Index'], dtype = 'int')\r\n    col = col - 1 \r\n    test_ind = np.array(robjects.r['Test_Idx'])\r\n    train_ind = np.array(robjects.r['Train_Idx'])\r\n    \r\n    # read the feature file\r\n    if (NumGenes > 0):\r\n        features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')\r\n\r\n    # read the data and 
labels\r\n    data_old = cb.data.ExprDataSet.read_table(DataPath,orientation=\"cg\", sep=\",\", index_col = 0, header = 0, sparsify = True).normalize()\r\n    labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col)\r\n    \r\n    data = cb.data.ExprDataSet(data_old.exprs[tokeep],data_old.obs.iloc[tokeep],data_old.var,data_old.uns)\r\n\r\n    labels = gft(LabelsPath, dtype = \"str\", skip_header = 1, delimiter = \",\", usecols = col)      \r\n    labels = labels[tokeep]\r\n   \r\n    truelab = []\r\n    pred = []\r\n    tr_time = []\r\n    ts_time = []\r\n    \r\n    for i in range(np.squeeze(nfolds)):\r\n        test_ind_i = np.array(test_ind[i], dtype = 'int') - 1\r\n        train_ind_i = np.array(train_ind[i], dtype = 'int') - 1\r\n\r\n        train=data[train_ind_i,:]\r\n        test=data[test_ind_i,:]\r\n        y_train = labels[train_ind_i]\r\n        y_test = labels[test_ind_i]\r\n        \r\n        if (NumGenes > 0):\r\n            feat_to_use = features.iloc[0:NumGenes,i]\r\n            train = train[:,feat_to_use]\r\n            test = test[:,feat_to_use]\r\n\r\n        \r\n        train.obs['cell_type'] = y_train\r\n                \r\n        start = tm.time()\r\n                \r\n        # reduce dimensions\r\n        num_epoch = 50\r\n        models = []\r\n    \r\n        for j in range(4):\r\n            models.append(cb.directi.fit_DIRECTi(train, epoch=num_epoch, patience=10, random_seed = j, path=\"%d\" % j))\r\n    \r\n        # train model\r\n        blast = cb.blast.BLAST(models, train).build_empirical()\r\n        tr_time.append(tm.time()-start)\r\n        \r\n        # predict labels\r\n        start = tm.time()\r\n        test_pred = blast.query(test).annotate('cell_type')\r\n        ts_time.append(tm.time()-start)\r\n\r\n        truelab.extend(y_test)\r\n        pred.extend(test_pred.values)\r\n    \r\n    #write results    \r\n    truelab = pd.DataFrame(truelab)\r\n    pred = pd.DataFrame(pred)\r\n            
\r\n    tr_time = pd.DataFrame(tr_time)\r\n    ts_time = pd.DataFrame(ts_time)\r\n\r\n    truelab.to_csv(str(Path(OutputDir+\"/Cell_BLAST_true.csv\")),index = False)\r\n    pred.to_csv(str(Path(OutputDir+\"/Cell_BLAST_pred.csv\")),index = False)\r\n    tr_time.to_csv(str(Path(OutputDir+\"/Cell_BLAST_training_time.csv\")), index = False)\r\n    ts_time.to_csv(str(Path(OutputDir+\"/Cell_BLAST_test_time.csv\")),index = False)\r\n\r\n\r\nrun_Cell_BLAST(argv[1], argv[2], argv[3], argv[4], argv[5], int(argv[6]))\r\n"
  },
  {
    "path": "Snakemake/Scripts/run_DigitalCellSorter.py",
    "content": "import numpy as np\r\nimport pandas as pd\r\nimport scripts.DigitalCellSorter as DigitalCellSorter\r\nimport os\r\nimport time as tm\r\nimport rpy2.robjects as robjects\r\n\r\ndef run_DigitalCellSorter(DataPath, LabelsPath, CV_RDataPath, GeneListPath, OutputDir, GeneOrderPath = \"\", NumGenes = 0):\r\n    '''\r\n    run DigitalCellSorter\r\n    Wrapper script to run DigitalCellSorter on a benchmark dataset using a predefined genelist,\r\n    outputs lists of true and predicted cell labels as csv files, as well as computation time.  \r\n  \r\n    Parameters\r\n    ----------\r\n    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes \r\n    as row names and gene names as column names.\r\n    LabelsPath : Cell population annotations file path (.csv).\r\n    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.\r\n    GeneListPath : Data file path to the genest.\r\n    OutputDir : Output directory defining the path of the exported file.\r\n    GeneOrderPath : Gene order file path (.csv) obtained from feature selection, \r\n    defining the genes order for each cross validation fold, default is NULL.\r\n    NumGenes : Number of genes used in case of feature selection (integer), default is 0.\r\n    '''\r\n        \r\n    # read the Rdata file\r\n    robjects.r['load'](CV_RDataPath)\r\n\r\n    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')\r\n    col = np.array(robjects.r['col_Index'], dtype = 'int')\r\n    col = col - 1\r\n    \r\n    # read the data\r\n    data = pd.read_csv(DataPath,index_col=0,sep=',')\r\n    data = data.iloc[tokeep]\r\n    \r\n    truelab = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col)\r\n    truelab = truelab.iloc[tokeep]\r\n\r\n\r\n    # read the feature file\r\n    if (NumGenes > 0):\r\n        features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')\r\n        feat_to_use = 
features.iloc[0:NumGenes,0]\r\n        data = data.iloc[:,feat_to_use]\r\n        \r\n    data = data.transpose()\r\n    \r\n    # number of different cell types in the data?\r\n    n_clusters = 8\r\n    AvailableCPUsCount = 1\r\n    N_samples_for_distribution = 10000\r\n        \r\n    start = tm.time()\r\n    pred = DigitalCellSorter.DigitalCellSorter().Process(data, 'DigitalCellSorter_Zhang', \r\n                                                saveDir = OutputDir, \r\n                                                geneListFileName = GeneListPath,\r\n                                                N_samples_for_distribution = N_samples_for_distribution,\r\n                                                AvailableCPUsCount = AvailableCPUsCount,\r\n                                                clusterIndex=None,\r\n                                                clusterName=None,\r\n                                                n_clusters=n_clusters)\t\r\n    runtime = tm.time() - start \r\n    \r\n    os.chdir(OutputDir)\r\n    \r\n    results = pd.read_excel('DigitalCellSorter_Zhang_voting.xlsx',header=0,index_col=None, usecols=[11])\r\n\r\n    prediction = np.zeros(np.shape(pred), dtype='>U10')\r\n    \r\n    for i in range(len(results)):\r\n    \tprediction[np.where(pred == i)] = results.values[i]\r\n    \r\n    prediction = pd.DataFrame(prediction)\r\n        \r\n    if (NumGenes == 0):  \r\n        truelab.to_csv(\"DigitalCellSorter_True_Labels.csv\", index = False)\r\n        prediction.to_csv(\"DigitalCellSorter_Pred_Labels.csv\", index = False)\r\n        with open(\"DigitalCellSorter_Total_Time.csv\", 'w') as f:\r\n            f.write(\"%f\\n\" % runtime)\r\n    else:\r\n        truelab.to_csv(\"DigitalCellSorter_\" + str(NumGenes) + \"_True_Labels.csv\", index = False)\r\n        prediction.to_csv(\"DigitalCellSorter_\" + str(NumGenes) + \"_Pred_Labels.csv\", index = False)\r\n        with open(\"DigitalCellSorter_\" + str(NumGenes) + 
\"_Total_Time.csv\", 'w') as f:\r\n            f.write(\"%f\\n\" % runtime)\r\n\r\n            \r\n\r\n        "
  },
  {
    "path": "Snakemake/Scripts/run_Garnett_CV.R",
    "content": "args <- commandArgs(TRUE)\r\n\r\nrun_Garnett_CV <- function(DataPath, LabelsPath, CV_RDataPath, GenesPath, MarkerPath, OutputDir, Human){\r\n  \"\r\n  run Garnett\r\n  Wrapper script to run Garnett on a benchmark dataset with 5-fold cross validation,\r\n  outputs lists of true and predicted cell labels as csv files, as well as computation time.\r\n  \r\n  Parameters\r\n  ----------\r\n  DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes \r\n  as row names and gene names as column names.\r\n  LabelsPath : Cell population annotations file path (.csv).\r\n  CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.\r\n  GenesPath : Path to the file with the genenames\r\n  MarkerPath : Path to the file with marker genes\r\n  OutputDir : Output directory defining the path of the exported file.\r\n  Human : boolean indicating whether the dataset is human (TRUE) or mouse (FALSE)\r\n  \"\r\n\r\n  # load needed libraries\r\n  library(garnett)\r\n  if (Human) {\r\n    library(org.Hs.eg.db)\r\n  } else {\r\n    library(org.Mm.eg.db)\r\n  }\r\n  \r\n  # load the CVFile\r\n  load(CV_RDataPath)\r\n  \r\n  # read the labels\r\n  labels <- as.matrix(read.csv(LabelsPath))\r\n  labels <- as.vector(labels[,col_Index])\r\n  labels <- labels[Cells_to_Keep]\r\n  \r\n  # read the data\r\n  mat <- read.table(DataPath, sep = \",\")\r\n  data <- mat[-1,-1]\r\n  data <- data[Cells_to_Keep,]\r\n  data <- t(data) #ensure that the genes are rows, and the cells are columns\r\n  \r\n  cells <- mat[-1,1]\r\n  cells <- cells[Cells_to_Keep]\r\n  \r\n  # read the genefile \r\n  fdata <- read.table(GenesPath)\r\n  names(fdata) <- 'gene_short_name'\r\n  row.names(fdata) <- fdata$gene_short_name\r\n  fd <- new(\"AnnotatedDataFrame\", data = fdata)\r\n  \r\n  true_labels <- list()\r\n  pred_labels <- list()\r\n  train_time <- list()\r\n  test_time <- list()\r\n  \r\n  for (i in c(1:n_folds)){\r\n    lab_train = 
labels[Train_Idx[[i]]]\r\n    lab_test = labels[Test_Idx[[i]]]\r\n    \r\n    train = data[,Train_Idx[[i]]]\r\n    test = data[,Test_Idx[[i]]]\r\n    \r\n    cells_train = cells[Train_Idx[[i]]]\r\n    cells_test = cells[Test_Idx[[i]]]\r\n    \r\n    pdata_train = data.frame(cells_train)\r\n    pdata_test = data.frame(cells_test)\r\n    \r\n    row.names(train) <- row.names(fdata)\r\n    row.names(test) <- row.names(fdata)\r\n    colnames(train) <- row.names(pdata_train)\r\n    colnames(test) <- row.names(pdata_test)\r\n    \r\n    pd_train <- new(\"AnnotatedDataFrame\", data = pdata_train)\r\n    pd_test <- new(\"AnnotatedDataFrame\", data = pdata_test)\r\n    \r\n    pbmc_cds_train <- newCellDataSet(as(train, \"dgCMatrix\"), phenoData = pd_train, featureData = fd)\r\n    pbmc_cds_test <- newCellDataSet(as(test, \"dgCMatrix\"), phenoData = pd_test, featureData = fd)\r\n    \r\n    pbmc_cds_train <- estimateSizeFactors(pbmc_cds_train)\r\n    pbmc_cds_test <- estimateSizeFactors(pbmc_cds_test)\r\n    \r\n    # training\r\n    start_train <- Sys.time()\r\n    \r\n    if (Human){\r\n      pbmc_classifier <- train_cell_classifier(cds = pbmc_cds_train, \r\n                                               marker_file = MarkerPath,\r\n                                               db=org.Hs.eg.db,\r\n                                               cds_gene_id_type = \"SYMBOL\",\r\n                                               num_unknown = 50,\r\n                                               marker_file_gene_id_type = \"SYMBOL\")\r\n    } else {\r\n      pbmc_classifier <- train_cell_classifier(cds = pbmc_cds_train, \r\n                                               marker_file = MarkerPath,\r\n                                               db=org.Mm.eg.db,\r\n                                               cds_gene_id_type = \"SYMBOL\",\r\n                                               num_unknown = 50,\r\n                                               
marker_file_gene_id_type = \"SYMBOL\")\r\n      \r\n    }\r\n    end_train <- Sys.time()\r\n    train_time[i] <- as.numeric(end_train - start_train)\r\n    \r\n    # testing\r\n    start_test <- Sys.time()\r\n    \r\n    if (Human) {\r\n      pbmc_cds_test <- classify_cells(pbmc_cds_test, \r\n                                      pbmc_classifier, \r\n                                      db = org.Hs.eg.db, \r\n                                      cluster_extend = TRUE,\r\n                                      cds_gene_id_type = \"SYMBOL\")\r\n    } else {\r\n      pbmc_cds_test <- classify_cells(pbmc_cds_test, \r\n                                      pbmc_classifier, \r\n                                      db = org.Mm.eg.db, \r\n                                      cluster_extend = TRUE,\r\n                                      cds_gene_id_type = \"SYMBOL\")\r\n    }\r\n    end_test <- Sys.time()\r\n    test_time[i] <- as.numeric(end_test - start_test)\r\n    \r\n    true_labels[i] <- list(lab_test)\r\n    pred_labels[i] <- list(pData(pbmc_cds_test)$cluster_ext_type)\r\n    \r\n    \r\n  }\r\n  \r\n  true_labels <- as.vector(unlist(true_labels))\r\n  pred_labels <- as.vector(unlist(pred_labels))\r\n  train_time <- as.vector(unlist(train_time))\r\n  test_time <- as.vector(unlist(test_time))\r\n\r\n  write.csv(true_labels,paste0(OutputDir,'/Garnett_CV_true.csv'),row.names = FALSE)\r\n  write.csv(pred_labels,paste0(OutputDir,'/Garnett_CV_pred.csv'),row.names = FALSE)\r\n  write.csv(train_time,paste0(OutputDir,'/Garnett_CV_training_time.csv'),row.names = FALSE)\r\n  write.csv(test_time,paste0(OutputDir,'/Garnett_CV_test_time.csv'),row.names = FALSE)\r\n\r\n}\r\n\r\nrun_Garnett_CV(args[1], args[2], args[3], args[4], args[5], args[6], args[7])\r\n"
  },
  {
    "path": "Snakemake/Scripts/run_Garnett_Pretrained.R",
    "content": "args <- commandArgs(TRUE)\r\n\r\nrun_Garnett_Pretrained <- function(DataPath, LabelsPath, GenesPath, CV_RDataPath, ClassifierPath, OutputDir, Human){\r\n  \"\r\n  run Garnett\r\n  Wrapper script to run Garnett on a benchmark dataset with a pretrained classifier,\r\n  outputs lists of true and predicted cell labels as csv files, as well as computation time.\r\n  \r\n  Parameters\r\n  ----------\r\n  DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes \r\n  as row names and gene names as column names.\r\n  LabelsPath : Cell population annotations file path (.csv).\r\n  CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.\r\n  GenesPath : Path to the file with the genenames\r\n  ClassifierPath : Path to the pretrained classifier\r\n  OutputDir : Output directory defining the path of the exported file.\r\n  Human : boolean indicating whether the dataset is human (TRUE) or mouse (FALSE)\r\n  \"\r\n  # load needed libraries\r\n  library(garnett)\r\n  \r\n  if (Human) {\r\n    library(org.Hs.eg.db)\r\n  } else {\r\n    library(org.Mm.eg.db)\r\n  }\r\n  \r\n  # load data, genes, and marker file\r\n  load(CV_RDataPath)\r\n  \r\n  load(ClassifierPath)\r\n  \r\n  labels <- as.matrix(read.csv(LabelsPath))\r\n  labels <- labels[Cells_to_Keep]\r\n  \r\n  mat <- read.table(DataPath, sep = \",\")\r\n  data <- mat[-1,-1]\r\n  data <- data[Cells_to_Keep,]\r\n  data <- t(data) #ensure that the genes are rows, and the cells are columns\r\n  \r\n  barcodes <- mat[-1,1]\r\n  \r\n  pdata = data.frame(barcodes)\r\n  fdata <- read.table(GenesPath)\r\n  names(fdata) <- 'gene_short_name'\r\n  row.names(fdata) <- fdata$gene_short_name\r\n  \r\n  row.names(data) <- row.names(fdata)\r\n  colnames(data) <- row.names(pdata)\r\n  \r\n  pd <- new(\"AnnotatedDataFrame\", data = pdata)\r\n  fd <- new(\"AnnotatedDataFrame\", data = fdata)\r\n  pbmc_cds <- newCellDataSet(as(data, \"dgCMatrix\"),\r\n               
              phenoData = pd,\r\n                             featureData = fd)\r\n  \r\n  start_time <- Sys.time()\r\n  \r\n  pbmc_cds <- estimateSizeFactors(pbmc_cds)\r\n  \r\n  if (Human){\r\n    pbmc_cds <- classify_cells(pbmc_cds, hsPBMC, db = org.Hs.eg.db, cluster_extend = TRUE, cds_gene_id_type = \"SYMBOL\")\r\n  } else {\r\n    pbmc_cds <- classify_cells(pbmc_cds, mmLung, db = org.Mm.eg.db, cluster_extend = TRUE, cds_gene_id_type = \"SYMBOL\")\r\n  }\r\n  \r\n  end_time <- Sys.time()\r\n  \r\n  test_time <- as.numeric(end_time - start_time)\r\n\r\n  write.table(pData(pbmc_cds)$cluster_ext_type,\r\n              file = paste0(OutputDir, \"/Garnett_Pretrained_pred.csv\"), append = FALSE, quote = TRUE, sep = \"\\t\",\r\n              eol = \"\\n\", na = \"NA\", dec = \".\", row.names = FALSE,\r\n              qmethod = c(\"escape\", \"double\"),\r\n              fileEncoding = \"\")\r\n\r\n  write.csv(labels,paste0(OutputDir,\"/Garnett_Pretrained_true.csv\"), row.names = FALSE)\r\n  write.csv(test_time,paste0(OutputDir,'/Garnett_Pretrained_test_time.csv'),row.names = FALSE)\r\n}\r\n\r\nrun_Garnett_Pretrained(args[1], args[2], args[3], args[4], args[5], args[6], args[7])\r\n"
  },
  {
    "path": "Snakemake/Scripts/run_LAmbDA.py",
    "content": "# -*- coding: utf-8 -*-\r\n\"\"\"\r\nCreated on Thu May 23 13:51:15 2019\r\n\r\n@author: Lieke\r\n\"\"\"\r\n\r\nimport os \r\nimport numpy as np\r\nimport pandas as pd\r\nimport time as tm\r\nimport rpy2.robjects as robjects\r\nimport tensorflow as tf\r\nimport math\r\nimport scipy.io as sio\r\nimport optunity as opt\r\nfrom tensorflow.contrib.tensor_forest.python import tensor_forest\r\nfrom tensorflow.python.ops import resources\r\n\r\n\r\ndef run_LAmbDA(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = \"\", NumGenes = 0):\r\n    '''\r\n    run LAmbDA classifier\r\n    Wrapper script to run LAmbDA on a benchmark dataset with 5-fold cross validation,\r\n    outputs lists of true and predicted cell labels as csv files, as well as computation time.\r\n  \r\n    Parameters\r\n    ----------\r\n    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes \r\n    as row names and gene names as column names.\r\n    LabelsPath : Cell population annotations file path (.csv).\r\n    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.\r\n    OutputDir : Output directory defining the path of the exported file.\r\n    GeneOrderPath : Gene order file path (.csv) obtained from feature selection, \r\n    defining the genes order for each cross validation fold, default is NULL.\r\n    NumGenes : Number of genes used in case of feature selection (integer), default is 0.\r\n    '''\r\n        \r\n    # read the Rdata file\r\n    robjects.r['load'](CV_RDataPath)\r\n\r\n    nfolds = np.array(robjects.r['n_folds'], dtype = 'int')\r\n    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')\r\n    col = np.array(robjects.r['col_Index'], dtype = 'int')\r\n    col = col - 1 \r\n    test_ind = np.array(robjects.r['Test_Idx'])\r\n    train_ind = np.array(robjects.r['Train_Idx'])\r\n\r\n    # read the data\r\n    data = pd.read_csv(DataPath,index_col=0,sep=',')\r\n    labels = 
pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col)\r\n    \r\n    labels = labels.iloc[tokeep]\r\n    data = data.iloc[tokeep]\r\n    \r\n    # read the feature file\r\n    if (NumGenes > 0):\r\n        features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')\r\n    \r\n    # folder with results\r\n    os.chdir(OutputDir)\r\n                \r\n    tr_time=[]\r\n    ts_time=[]\r\n    truelab = np.zeros([len(labels),1],dtype = int)\r\n    predlab = np.zeros([len(labels),1],dtype = int)\r\n        \r\n    for i in range(np.squeeze(nfolds)):\r\n        global X, Y, Gnp, Dnp, train, test, prt, cv\r\n        test_ind_i = np.array(test_ind[i], dtype = 'int') - 1\r\n        train_ind_i = np.array(train_ind[i], dtype = 'int') - 1\r\n                \r\n        X = np.array(data) \r\n        if (NumGenes > 0):\r\n            X = np.log2(X/10+1)\r\n            feat_to_use = features.iloc[0:NumGenes,i]\r\n            X = X[:,feat_to_use]\r\n        else:\r\n            X = np.log2(np.transpose(select_feats(np.transpose(X),0.5,80))/10+1)\r\n    \r\n        uniq = np.unique(labels)\r\n        Y = np.zeros([len(labels),len(uniq)],int)\r\n        \r\n        for j in range(len(uniq)):\r\n            Y[np.where(labels == uniq[j])[0],j] = 1\r\n    \r\n        Y = np.array(Y)\r\n        \r\n        Gnp = np.zeros([len(uniq),len(uniq)],int)\r\n        np.fill_diagonal(Gnp,1)\r\n        Gnp = np.array(Gnp)\r\n        \r\n        Dnp = np.ones([len(uniq),1],int)\r\n        Dnp = np.array(Dnp)\r\n        \r\n        train_samp = int(np.floor(0.75*len(train_ind_i)))\r\n        test_samp = len(train_ind_i) - train_samp\r\n        perm = np.random.permutation(len(train_ind_i))\r\n        train = perm[0:train_samp]\r\n        test = perm[train_samp:test_samp+1]\r\n        \r\n        while(np.sum(np.sum(Y[train,:],0)<5)>0):\r\n            perm = np.random.permutation(X.shape[0])\r\n            train = perm[0:train_samp+1]\r\n            test = 
perm[train_samp+1:train_samp+test_samp+1]\r\n        \r\n        cv = i\r\n        optunity_it = 0\r\n        prt = False\r\n        opt_params = None\r\n                    \r\n        start=tm.time()\r\n        opt_params, _, _ = opt.minimize(run_LAmbDA2,solver_name='sobol', gamma=[0.8,1.2], delta=[0.05,0.95], tau=[10.0,11.0], prc_cut=[20,50], bs_prc=[0.2,0.6], num_trees=[10,200], max_nodes=[100,1000], num_evals=50)\r\n        tr_time.append(tm.time()-start)\r\n        \r\n        print(\"Finished training!\")\r\n        \r\n        prt = True\r\n        train = train_ind_i\r\n        test = test_ind_i\r\n        \r\n        start=tm.time()\r\n        err = run_LAmbDA2(opt_params['gamma'], opt_params['delta'], opt_params['tau'], opt_params['prc_cut'], opt_params['bs_prc'], opt_params['num_trees'], opt_params['max_nodes'])\r\n        ts_time.append(tm.time()-start)\r\n        \r\n        tf.reset_default_graph();\r\n        \r\n        predfile = 'preds_cv' + str(cv) + '.mat'\r\n        truefile = 'truth_cv' + str(cv) + '.mat'\r\n        pred = sio.loadmat(predfile)\r\n        truth = sio.loadmat(truefile)\r\n        \r\n        pred = pred['preds']\r\n        truth = truth['labels']\r\n        \r\n        pred_ind = np.argmax(pred,axis=1)\r\n        truth_ind = np.argmax(truth,axis=1)\r\n        \r\n        predlab[test_ind_i,0] = pred_ind\r\n        truelab[test_ind_i,0] = truth_ind\r\n            \r\n                \r\n    truelab = pd.DataFrame(truelab)\r\n    predlab = pd.DataFrame(predlab)\r\n        \r\n    tr_time = pd.DataFrame(tr_time)\r\n    ts_time = pd.DataFrame(ts_time)\r\n        \r\n    if (NumGenes == 0):  \r\n        truelab.to_csv(\"LAmbDA_True_Labels.csv\", index = False)\r\n        predlab.to_csv(\"LAmbDA_Pred_Labels.csv\", index = False)\r\n        tr_time.to_csv(\"LAmbDA_Training_Time.csv\", index = False)\r\n        ts_time.to_csv(\"LAmbDA_Testing_Time.csv\", index = False)\r\n    else:\r\n        truelab.to_csv(\"LAmbDA_\" + str(NumGenes) 
+ \"_True_Labels.csv\", index = False)\r\n        predlab.to_csv(\"LAmbDA_\" + str(NumGenes) + \"_Pred_Labels.csv\", index = False)\r\n        tr_time.to_csv(\"LAmbDA_\" + str(NumGenes) + \"_Training_Time.csv\", index = False)\r\n        ts_time.to_csv(\"LAmbDA_\" + str(NumGenes) + \"_Testing_Time.csv\", index = False)\r\n\r\n\r\n##### Functions copied from LAmbDA's Github\r\ndef wt_cutoff(colnum,cutoff,Gtmp,gamma):\r\n\trowsums = np.sum(Gtmp,axis=1);\r\n\treturn(math.ceil(cutoff*(math.log((max(rowsums)/rowsums[colnum])+1,2)**gamma)))\r\n\r\ndef resample(prc_cut,Y,Gtmp,train,gamma):\r\n\tadd = list()\r\n\trem = list()\r\n\tcolsums = np.sum(Y[train,:],axis=0);\r\n\tcutoff = math.ceil(np.percentile(colsums,prc_cut));\r\n\tfor i in range(len(colsums)):\r\n\t\tif colsums[i] == 0:\r\n\t\t\tpass\r\n\t\telif colsums[i] < wt_cutoff(i,cutoff,Gtmp,gamma):\r\n\t\t\tidx = np.squeeze(np.array(np.where(Y[train,i]>=1)));\r\n\t\t\tchoice = np.random.choice(train[idx],int(wt_cutoff(i,cutoff,Gtmp,gamma)-colsums[i]))\r\n\t\t\tadd = add + choice.tolist();\r\n\t\telif colsums[i] == wt_cutoff(i,cutoff,Gtmp,gamma):\r\n\t\t\tpass\r\n\t\telse:\r\n\t\t\tidx = np.squeeze(np.array(np.where(Y[train,i]>=1)));\r\n\t\t\tchoice = np.random.choice(train[idx],int(colsums[i]-wt_cutoff(i,cutoff,Gtmp,gamma)),replace=False)\r\n\t\t\trem = rem + choice.tolist()\r\n\treturn np.concatenate((list([val for val in train if val not in rem]),add));\r\n\r\ndef select_feats(Xtmp,num_zero_prc_cut,var_prc_cut):\r\n\t#*********************************************************************\r\n\t# remove features with many zeros\r\n\tnum_feat_zeros = np.sum(Xtmp==0,axis=1);\r\n\tXtmp = Xtmp[num_feat_zeros<num_zero_prc_cut*Xtmp.shape[1],:]\r\n\t#*********************************************************************\r\n\t# remove features with low variance\r\n\tfeat_vars = np.var(Xtmp,axis=1)\r\n\tXtmp = Xtmp[feat_vars>np.percentile(feat_vars,var_prc_cut),:]\r\n\treturn(Xtmp)\r\n\r\ndef 
get_yn(predict,ys,delta,tau,output_feats):\r\n\tD = tf.cast(Dnp, tf.float32);\r\n\tG = tf.cast(Gnp, tf.float32);\r\n\tys = tf.cast(ys, tf.float32);\r\n\t#print(\"start\")\r\n\tCm = tf.matmul(tf.transpose(tf.matmul(ys,D)),predict+0.1)/tf.reshape(tf.reduce_sum(tf.transpose(tf.matmul(ys,D)),1),(-1,1));\r\n\t#print(\"1\")\r\n\tmCm = tf.reshape(tf.reduce_mean(tf.cast(tf.matmul(tf.transpose(D),G)>0,tf.float32)*Cm,1),(-1,1));\r\n\t#print(\"2\")\r\n\tyw = tf.multiply(predict+0.1,tf.matmul(tf.matmul(ys,D),tf.pow(mCm/Cm,tau)));\r\n\t#print(\"3\")\r\n\tye = tf.multiply(tf.matmul(ys,G),yw);\r\n\t#print(\"4\")\r\n\tyt = tf.matmul(ys,tf.matmul(tf.transpose(ys),ye));\r\n\t#print(\"5\")\r\n\tya = (delta*yt)+((1-delta)*ye)\r\n\t#print(\"6\")\r\n\tyn = tf.cast(tf.one_hot(tf.argmax(ya,axis=1),output_feats), dtype=tf.float32)\r\n\t#print(\"7\")\r\n\treturn(yn)\r\n\r\ndef get_yi(rowsums,G2,ys):\r\n\tG2 = tf.cast(G2, tf.float32);\r\n\tys = tf.cast(ys, tf.float32);\r\n\tyi = tf.cast(tf.matmul(ys,G2), dtype=tf.float32);\r\n\treturn(yi)\r\n\r\ndef run_LAmbDA2(gamma, delta, tau, prc_cut, bs_prc, num_trees, max_nodes):\r\n\tglobal X, Y, Gnp, Dnp, train, test, prt, cv\r\n\tD = tf.cast(Dnp, tf.float32);\r\n\tG = tf.cast(Gnp, tf.float32);\r\n\t#optunity_it = optunity_it+1;\r\n\tnum_trees = int(num_trees);\r\n\tmax_nodes = int(max_nodes);\r\n\tprc_cut = int(np.ceil(prc_cut));\r\n\tprint(\"gamma=%.4f, delta=%.4f, tau=%.4f, prc_cut=%i, bs_prc=%.4f, num_trees=%i, max_nodes=%i\" % (gamma, delta, tau, prc_cut, bs_prc, num_trees, max_nodes))\r\n\tinput_feats = X.shape[1];\r\n\tnum_labls = G.shape.as_list();\r\n\toutput_feats = num_labls[1];\r\n\t#print(output_feats)\r\n\tnum_labls = num_labls[0];\r\n\trowsums = np.sum(Gnp,axis=1);\r\n\ttrain2 = resample(prc_cut, Y, Gnp, train, gamma);\t\t\t\t# Bug??\r\n\tbs = int(np.ceil(bs_prc*train2.size))\r\n\txs = tf.placeholder(tf.float32, [None,input_feats])\r\n\t#ys = tf.placeholder(tf.float32, [None,num_labls])\r\n\tyin = tf.placeholder(tf.int32, 
[None])\r\n\tprint(\"Vars loaded xs and ys created\")\r\n\thparams = tensor_forest.ForestHParams(num_classes=output_feats,\r\n\t\t\t\t\t\t\t\t\tnum_features=input_feats,\r\n\t\t\t\t\t\t\t\t\tnum_trees=num_trees,\r\n\t\t\t\t\t\t\t\t\tmax_nodes=max_nodes).fill()\r\n\tprint(\"Tensor forest hparams created\")\t\t\t\t\t\t\t\t\r\n\tforest_graph = tensor_forest.RandomForestGraphs(hparams)\r\n\tprint(\"Tensor forest graph created\")\r\n\ttrain_op = forest_graph.training_graph(xs, yin)\r\n\tloss_op = forest_graph.training_loss(xs, yin)\r\n\tprint(\"Loss and train ops created\")\r\n\tpredict, _, _ = forest_graph.inference_graph(xs)\r\n\tprint(\"Tensor forest variables created through predict\")\r\n\taccuracy_op = tf.reduce_mean(tf.reduce_sum(tf.square(tf.one_hot(yin,output_feats)-predict),reduction_indices=[1]))\r\n\tprint(tf.reduce_sum(tf.square(tf.one_hot(yin,output_feats)-predict),reduction_indices=[1]))\r\n\t#predict = tf.one_hot(pred);\r\n\tprint(\"Lambda specific variables created\")\r\n\t# Creating training and testing steps\r\n\tG2 = np.copy(Gnp);\r\n\tG2[rowsums>1,:] = 0;\r\n\tYI = np.matmul(Y,G2);\r\n\tYIrs = np.sum(YI,axis=1);\r\n\ttrainI = train2[np.in1d(train2,np.where(YIrs==1))];\r\n\tprint(\"data type trainI,\",trainI.dtype)\r\n\ttestI = test[np.in1d(test,np.where(YIrs==1))];\r\n\tprint(\"trainI testI created\")\r\n\t#init_vars=tf.global_variables_initializer()\r\n\tinit_vars = tf.group(tf.global_variables_initializer(),\r\n\tresources.initialize_resources(resources.shared_resources()))\r\n\tsess = tf.Session()\r\n\tsess.run(init_vars)\r\n\tprint(\"Session started\")\r\n\t#beep = sess.run(predict,feed_dict={xs:X[1:100,:]});\r\n\t#beep = sess.run(predict,feed_dict={xs:X[train2[0:bs],:]});\r\n\ttensor_trainI = {xs: X[trainI, :], yin: sess.run(tf.argmax(get_yi(rowsums,G2,Y[trainI, :]),axis=1))}\r\n\tprint(\"tensor_trainI made\")\r\n\ttensor_testI = {xs: X[testI, :], yin: sess.run(tf.argmax(get_yi(rowsums,G2,Y[testI, :]),axis=1))}\r\n\tprint(\"tensor_testI 
made\")\r\n\ttensor_train = {xs: X[train2[0:bs], :], yin: sess.run(tf.argmax(get_yn(sess.run(predict,feed_dict={xs:X[train2[0:bs],:]}),Y[train2[0:bs], :],delta,tau,output_feats),axis=1))}\r\n\tprint(\"tensor_train made\")\r\n\ttensor_test = {xs: X[test, :], yin: sess.run(tf.argmax(get_yn(sess.run(predict,feed_dict={xs:X[test,:]}),Y[test, :],delta,tau,output_feats),axis=1))}\r\n\tprint(\"tensor_test made\")\r\n\t#**********************************\r\n\t#print(\"Loss and training steps created with sample tensors\")\r\n\t# Setting params and initializing\r\n\tprint(\"Beginning iterations\")\r\n\t# Starting training iterations\r\n\tprint(X.shape)\r\n\tfor i in range(1,101):\r\n\t\tif i < 50:\r\n\t\t\tsess.run(train_op, feed_dict=tensor_trainI)\r\n\t\t\t#print(\"ran train op\")\r\n\t\t\tif i % 10 == 0:\r\n\t\t\t\tprint(str(sess.run(accuracy_op, feed_dict=tensor_trainI)) + ' ' + str(sess.run(accuracy_op, feed_dict=tensor_testI)))\r\n\t\telse:\r\n\t\t\tsess.run(train_op, feed_dict=tensor_train)\r\n\t\t\tif i % 10 == 0:\r\n\t\t\t\tprint(str(sess.run(accuracy_op, feed_dict=tensor_train)) + ' ' + str(sess.run(accuracy_op, feed_dict=tensor_test)))\r\n\t\t\t# NOTE(review): this elif repeats the condition of the if above and can never fire; confirm the intended re-shuffle period\r\n\t\t\telif i % 10 == 0:\r\n\t\t\t\tnp.random.shuffle(train2);\r\n\t\t\t\ttensor_train = {xs: X[train2[0:bs], :], yin: sess.run(tf.argmax(get_yn(sess.run(predict,feed_dict={xs:X[train2[0:bs],:]}),Y[train2[0:bs], :],delta,tau,output_feats),axis=1))}\r\n\tif prt:\r\n\t\tblah = sess.run(predict, feed_dict=tensor_test);\r\n\t\tsio.savemat('preds_cv' + str(cv) + '.mat', {'preds': blah});\r\n\t\tsio.savemat('truth_cv' + str(cv) + '.mat', {'labels': Y[test, :]});\r\n\tacc = sess.run(accuracy_op, feed_dict=tensor_test) \r\n\tprint(\"loss1=%.4f, gamma=%.4f, delta=%.4f, tau=%.4f, prc_cut=%i, bs_prc=%.4f, num_trees=%i, max_nodes=%i\" % (acc, gamma, delta, tau, prc_cut, bs_prc, num_trees, max_nodes))\r\n\ttf.reset_default_graph();\r\n\treturn(acc)\r\n"
  },
  {
    "path": "Snakemake/Scripts/run_LDA.py",
    "content": "import os\r\nfrom sys import argv\r\nfrom pathlib import Path\r\n\r\nimport numpy as np\r\nimport pandas as pd\r\nimport time as tm\r\nfrom sklearn.discriminant_analysis import LinearDiscriminantAnalysis\r\nimport rpy2.robjects as robjects\r\n\r\n\r\ndef run_LDA(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = \"\", NumGenes = 0):\r\n    '''\r\n    run baseline classifier: LDA\r\n    Wrapper script to run LDA classifier on a benchmark dataset with 5-fold cross validation,\r\n    outputs lists of true and predicted cell labels as csv files, as well as computation time.\r\n\r\n    Parameters\r\n    ----------\r\n    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes\r\n    as row names and gene names as column names.\r\n    LabelsPath : Cell population annotations file path (.csv).\r\n    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.\r\n    OutputDir : Output directory defining the path of the exported file.\r\n    GeneOrderPath : Gene order file path (.csv) obtained from feature selection,\r\n    defining the genes order for each cross validation fold, default is NULL.\r\n    NumGenes : Number of genes used in case of feature selection (integer), default is 0.\r\n    '''\r\n\r\n    # read the Rdata file\r\n    robjects.r['load'](CV_RDataPath)\r\n\r\n    nfolds = np.array(robjects.r['n_folds'], dtype = 'int')\r\n    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')\r\n    col = np.array(robjects.r['col_Index'], dtype = 'int')\r\n    col = col - 1\r\n    test_ind = np.array(robjects.r['Test_Idx'])\r\n    train_ind = np.array(robjects.r['Train_Idx'])\r\n\r\n    # read the data\r\n    data = pd.read_csv(DataPath,index_col=0,sep=',')\r\n    labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col)\r\n\r\n    labels = labels.iloc[tokeep]\r\n    data = data.iloc[tokeep]\r\n\r\n    # read the feature file\r\n    if (NumGenes > 
0):\r\n        features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')\r\n\r\n    # normalize data\r\n    data = np.log1p(data)\r\n\r\n    Classifier = LinearDiscriminantAnalysis()\r\n\r\n    tr_time=[]\r\n    ts_time=[]\r\n    truelab = []\r\n    pred = []\r\n\r\n    for i in range(np.squeeze(nfolds)):\r\n        test_ind_i = np.array(test_ind[i], dtype = 'int') - 1\r\n        train_ind_i = np.array(train_ind[i], dtype = 'int') - 1\r\n\r\n        train=data.iloc[train_ind_i]\r\n        test=data.iloc[test_ind_i]\r\n        y_train=labels.iloc[train_ind_i]\r\n        y_test=labels.iloc[test_ind_i]\r\n\r\n        if (NumGenes > 0):\r\n            feat_to_use = features.iloc[0:NumGenes,i]\r\n            train = train.iloc[:,feat_to_use]\r\n            test = test.iloc[:,feat_to_use]\r\n\r\n        start=tm.time()\r\n        Classifier.fit(train, y_train)\r\n        tr_time.append(tm.time()-start)\r\n\r\n        start=tm.time()\r\n        predicted = Classifier.predict(test)\r\n        ts_time.append(tm.time()-start)\r\n\r\n        truelab.extend(y_test.values)\r\n        pred.extend(predicted)\r\n\r\n    truelab = pd.DataFrame(truelab)\r\n    pred = pd.DataFrame(pred)\r\n\r\n    tr_time = pd.DataFrame(tr_time)\r\n    ts_time = pd.DataFrame(ts_time)\r\n\r\n    OutputDir = Path(OutputDir)\r\n    truelab.to_csv(str(OutputDir / Path(\"LDA_true.csv\")),\r\n                   index = False)\r\n    pred.to_csv(str(OutputDir / Path(\"LDA_pred.csv\")),\r\r\n                index = False)\r\r\n    tr_time.to_csv(str(OutputDir / Path(\"LDA_training_time.csv\")),\r\n                   index = False)\r\n    ts_time.to_csv(str(OutputDir / Path(\"LDA_test_time.csv\")),\r\n                   index = False)\r\n\r\nrun_LDA(argv[1], argv[2], argv[3], argv[4], argv[5], int(argv[6]))\r\n"
  },
  {
    "path": "Snakemake/Scripts/run_LDA_rejection.py",
    "content": "import os\r\nfrom sys import argv\r\nfrom pathlib import Path\r\n\r\nimport numpy as np\r\nimport pandas as pd\r\nimport time as tm\r\nfrom sklearn.discriminant_analysis import LinearDiscriminantAnalysis\r\nimport rpy2.robjects as robjects\r\n\r\n\r\ndef run_LDA_rejection(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = \"\", NumGenes = 0, Threshold = 0.7):\r\n    '''\r\n    run baseline classifier: LDA\r\n    Wrapper script to run LDA classifier on a benchmark dataset with 5-fold cross validation,\r\n    outputs lists of true and predicted cell labels as csv files, as well as computation time.\r\n\r\n    Parameters\r\n    ----------\r\n    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes\r\n    as row names and gene names as column names.\r\n    LabelsPath : Cell population annotations file path (.csv).\r\n    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.\r\n    OutputDir : Output directory defining the path of the exported file.\r\n    GeneOrderPath : Gene order file path (.csv) obtained from feature selection,\r\n    defining the genes order for each cross validation fold, default is NULL.\r\n    NumGenes : Number of genes used in case of feature selection (integer), default is 0.\r\n    Threshold : Threshold used when rejecting the genes, default is 0.7.\r\n    '''\r\n\r\n    # read the Rdata file\r\n    robjects.r['load'](CV_RDataPath)\r\n\r\n    nfolds = np.array(robjects.r['n_folds'], dtype = 'int')\r\n    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')\r\n    col = np.array(robjects.r['col_Index'], dtype = 'int')\r\n    col = col - 1\r\n    test_ind = np.array(robjects.r['Test_Idx'])\r\n    train_ind = np.array(robjects.r['Train_Idx'])\r\n\r\n    # read the data\r\n    data = pd.read_csv(DataPath,index_col=0,sep=',')\r\n    labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col)\r\n\r\n    labels = 
labels.iloc[tokeep]\r\n    data = data.iloc[tokeep]\r\n\r\n    # read the feature file\r\n    if (NumGenes > 0):\r\n        features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')\r\n\r\n    # normalize data\r\n    data = np.log1p(data)\r\n\r\n    Classifier = LinearDiscriminantAnalysis()\r\n\r\n    tr_time=[]\r\n    ts_time=[]\r\n    truelab = []\r\n    pred = []\r\n\r\n    for i in range(np.squeeze(nfolds)):\r\n        test_ind_i = np.array(test_ind[i], dtype = 'int') - 1\r\n        train_ind_i = np.array(train_ind[i], dtype = 'int') - 1\r\n\r\n        train=data.iloc[train_ind_i]\r\n        test=data.iloc[test_ind_i]\r\n        y_train=labels.iloc[train_ind_i]\r\n        y_test=labels.iloc[test_ind_i]\r\n\r\n        if (NumGenes > 0):\r\n            feat_to_use = features.iloc[0:NumGenes,i]\r\n            train = train.iloc[:,feat_to_use]\r\n            test = test.iloc[:,feat_to_use]\r\n\r\n        start=tm.time()\r\n        Classifier.fit(train, y_train)\r\n        tr_time.append(tm.time()-start)\r\n\r\n        start=tm.time()\r\n        predicted = Classifier.predict(test)\r\n        prob = np.max(Classifier.predict_proba(test), axis = 1)\r\n        unlabeled = np.where(prob < Threshold)\r\n        predicted[unlabeled] = 'Unknown'\r\n        ts_time.append(tm.time()-start)\r\n\r\n        truelab.extend(y_test.values)\r\n        pred.extend(predicted)\r\n\r\n    truelab = pd.DataFrame(truelab)\r\n    pred = pd.DataFrame(pred)\r\n\r\n    tr_time = pd.DataFrame(tr_time)\r\n    ts_time = pd.DataFrame(ts_time)\r\n\r\n    OutputDir = Path(OutputDir)\r\n    truelab.to_csv(str(OutputDir / Path(\"LDA_rejection_true.csv\")),\r\n                   index = False)\r\n    pred.to_csv(str(OutputDir / Path(\"LDA_rejection_pred.csv\")),\r\n\r\n                index = False)\r\n\r\n    tr_time.to_csv(str(OutputDir / Path(\"LDA_rejection_training_time.csv\")),\r\n                   index = False)\r\n    ts_time.to_csv(str(OutputDir / 
Path(\"LDA_rejection_test_time.csv\")),\r\n                   index = False)\r\n\r\nrun_LDA_rejection(argv[1], argv[2], argv[3], argv[4], argv[5], int(argv[6]))\r\n"
  },
  {
    "path": "Snakemake/Scripts/run_NMC.py",
    "content": "import os\r\nfrom sys import argv\r\nfrom pathlib import Path\r\n\r\nimport numpy as np\r\nimport pandas as pd\r\nimport time as tm\r\nfrom sklearn.neighbors import NearestCentroid\r\nimport rpy2.robjects as robjects\r\n\r\n\r\ndef run_NMC(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = \"\", NumGenes = 0):\r\n    '''\r\n    run baseline classifier: NMC\r\n    Wrapper script to run a NMC classifier on a benchmark dataset with 5-fold cross validation,\r\n    outputs lists of true and predicted cell labels as csv files, as well as computation time.\r\n\r\n    Parameters\r\n    ----------\r\n    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes\r\n    as row names and gene names as column names.\r\n    LabelsPath : Cell population annotations file path (.csv).\r\n    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.\r\n    OutputDir : Output directory defining the path of the exported file.\r\n    GeneOrderPath : Gene order file path (.csv) obtained from feature selection,\r\n    defining the genes order for each cross validation fold, default is NULL.\r\n    NumGenes : Number of genes used in case of feature selection (integer), default is 0.\r\n    '''\r\n\r\n    # read the Rdata file\r\n    robjects.r['load'](CV_RDataPath)\r\n\r\n    nfolds = np.array(robjects.r['n_folds'], dtype = 'int')\r\n    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')\r\n    col = np.array(robjects.r['col_Index'], dtype = 'int')\r\n    col = col - 1\r\n    test_ind = np.array(robjects.r['Test_Idx'])\r\n    train_ind = np.array(robjects.r['Train_Idx'])\r\n\r\n    # read the data\r\n    data = pd.read_csv(DataPath,index_col=0,sep=',')\r\n    labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col)\r\n\r\n    labels = labels.iloc[tokeep]\r\n    data = data.iloc[tokeep]\r\n\r\n    # read the feature file\r\n    if (NumGenes > 0):\r\n        
features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')\r\n\r\n    # normalize data\r\n    data = np.log1p(data)\r\n\r\n    Classifier = NearestCentroid()\r\n\r\n    tr_time=[]\r\n    ts_time=[]\r\n    truelab = []\r\n    pred = []\r\n\r\n    for i in range(np.squeeze(nfolds)):\r\n        test_ind_i = np.array(test_ind[i], dtype = 'int') - 1\r\n        train_ind_i = np.array(train_ind[i], dtype = 'int') - 1\r\n\r\n        train=data.iloc[train_ind_i]\r\n        test=data.iloc[test_ind_i]\r\n        y_train=labels.iloc[train_ind_i]\r\n        y_test=labels.iloc[test_ind_i]\r\n\r\n        if (NumGenes > 0):\r\n            feat_to_use = features.iloc[0:NumGenes,i]\r\n            train = train.iloc[:,feat_to_use]\r\n            test = test.iloc[:,feat_to_use]\r\n\r\n        start=tm.time()\r\n        Classifier.fit(train, y_train)\r\n        tr_time.append(tm.time()-start)\r\n\r\n        start=tm.time()\r\n        predicted = Classifier.predict(test)\r\n        ts_time.append(tm.time()-start)\r\n\r\n        truelab.extend(y_test.values)\r\n        pred.extend(predicted)\r\n\r\n    truelab = pd.DataFrame(truelab)\r\n    pred = pd.DataFrame(pred)\r\n\r\n    tr_time = pd.DataFrame(tr_time)\r\n    ts_time = pd.DataFrame(ts_time)\r\n\r\n    OutputDir = Path(OutputDir)\r\r\n    truelab.to_csv(str(OutputDir / Path(\"NMC_true.csv\")),\r\r\n                   index = False)\r\r\n    pred.to_csv(str(OutputDir / Path(\"NMC_pred.csv\")),\r\n    \r            index = False)\r\r\n    tr_time.to_csv(str(OutputDir / Path(\"NMC_training_time.csv\")),\r\n                   index = False)\r\n    ts_time.to_csv(str(OutputDir / Path(\"NMC_test_time.csv\")),\r\n                   index = False)\r\n\r\nrun_NMC(argv[1], argv[2], argv[3], argv[4], argv[5], int(argv[6]))\r\n"
  },
  {
    "path": "Snakemake/Scripts/run_RF.py",
    "content": "import os\r\nfrom sys import argv\r\nfrom pathlib import Path\r\n\r\nimport numpy as np\r\nimport pandas as pd\r\nimport time as tm\r\nfrom sklearn.ensemble import RandomForestClassifier\r\nimport rpy2.robjects as robjects\r\n\r\n\r\ndef run_RF(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = \"\", NumGenes = 0):\r\n    '''\r\n    run baseline classifier: RF\r\n    Wrapper script to run a RF classifier with 50 trees on a benchmark dataset with 5-fold cross validation,\r\n    outputs lists of true and predicted cell labels as csv files, as well as computation time.\r\n\r\n    Parameters\r\n    ----------\r\n    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes\r\n    as row names and gene names as column names.\r\n    LabelsPath : Cell population annotations file path (.csv).\r\n    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.\r\n    OutputDir : Output directory defining the path of the exported file.\r\n    GeneOrderPath : Gene order file path (.csv) obtained from feature selection,\r\n    defining the genes order for each cross validation fold, default is NULL.\r\n    NumGenes : Number of genes used in case of feature selection (integer), default is 0.\r\n    '''\r\n\r\n    # read the Rdata file\r\n    robjects.r['load'](CV_RDataPath)\r\n\r\n    nfolds = np.array(robjects.r['n_folds'], dtype = 'int')\r\n    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')\r\n    col = np.array(robjects.r['col_Index'], dtype = 'int')\r\n    col = col - 1\r\n    test_ind = np.array(robjects.r['Test_Idx'])\r\n    train_ind = np.array(robjects.r['Train_Idx'])\r\n\r\n    # read the data\r\n    data = pd.read_csv(DataPath,index_col=0,sep=',')\r\n    labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col)\r\n\r\n    labels = labels.iloc[tokeep]\r\n    data = data.iloc[tokeep]\r\n\r\n    # read the feature file\r\n    if (NumGenes > 
0):\r\n        features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')\r\n\r\n    # normalize data\r\n    data = np.log1p(data)\r\n\r\n    Classifier = RandomForestClassifier(n_estimators = 50)\r\n\r\n    tr_time=[]\r\n    ts_time=[]\r\n    truelab = []\r\n    pred = []\r\n\r\n    for i in range(np.squeeze(nfolds)):\r\n        test_ind_i = np.array(test_ind[i], dtype = 'int') - 1\r\n        train_ind_i = np.array(train_ind[i], dtype = 'int') - 1\r\n\r\n        train=data.iloc[train_ind_i]\r\n        test=data.iloc[test_ind_i]\r\n        y_train=labels.iloc[train_ind_i]\r\n        y_test=labels.iloc[test_ind_i]\r\n\r\n        if (NumGenes > 0):\r\n            feat_to_use = features.iloc[0:NumGenes,i]\r\n            train = train.iloc[:,feat_to_use]\r\n            test = test.iloc[:,feat_to_use]\r\n\r\n        start=tm.time()\r\n        Classifier.fit(train, y_train)\r\n        tr_time.append(tm.time()-start)\r\n\r\n        start=tm.time()\r\n        predicted = Classifier.predict(test)\r\n        ts_time.append(tm.time()-start)\r\n\r\n        truelab.extend(y_test.values)\r\n        pred.extend(predicted)\r\n\r\n    truelab = pd.DataFrame(truelab)\r\n    pred = pd.DataFrame(pred)\r\n\r\n    tr_time = pd.DataFrame(tr_time)\r\n    ts_time = pd.DataFrame(ts_time)\r\n\r\n    OutputDir = Path(OutputDir)\r\r\n    truelab.to_csv(str(OutputDir / Path(\"RF_true.csv\")),\r\n                   index = False)\r\n    pred.to_csv(str(OutputDir / Path(\"RF_pred.csv\")),\r\r\n                index = False)\r\r\n    tr_time.to_csv(str(OutputDir / Path(\"RF_training_time.csv\")),\r\n                   index = False)\r\n    ts_time.to_csv(str(OutputDir / Path(\"RF_test_time.csv\")),\r\n                   index = False)\r\n\r\nrun_RF(argv[1], argv[2], argv[3], argv[4], argv[5], int(argv[6]))\r\n"
  },
  {
    "path": "Snakemake/Scripts/run_SCINA.R",
    "content": "run_SCINA<-function(DataPath,LabelsPath,GeneSigPath,OutputDir){\r\n  \"\r\n  run SCINA\r\n  Wrapper script to run SCINA on a benchmark dataset,\r\n  outputs lists of true and predicted cell labels as csv files, as well as computation time.\r\n  \r\n  Parameters\r\n  ----------\r\n  DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes \r\n  as row names and gene names as column names.\r\n  LabelsPath : Cell population annotations file path (.csv).\r\n  GeneSigPath : Cell type marker genes file path (.csv)\r\n  OutputDir : Output directory defining the path of the exported file.\r\n  \"\r\n  \r\n  Data <- read.csv(DataPath,row.names = 1)\r\n  Labels <- as.vector(as.matrix(read.csv(LabelsPath)))\r\n  Data <- Data[is.element(Labels,c('CD14+ Monocyte','CD19+ B','CD56+ NK')),]\r\n  Labels <- Labels[is.element(Labels,c('CD14+ Monocyte','CD19+ B','CD56+ NK'))]\r\n  Labels[Labels == 'CD14+ Monocyte'] <- 'CD14_Monocyte'\r\n  Labels[Labels == 'CD19+ B'] <- 'CD19_B'\r\n  Labels[Labels == 'CD56+ NK'] <- 'CD56_NK'\r\n  \r\n  \r\n  #############################################################################\r\n  #                                 SCINA                                     #\r\n  #############################################################################\r\n  library(SCINA)\r\n  Signature_Genes <- preprocess.signatures(GeneSigPath)\r\n  True_Labels_SCINA <- list()\r\n  Pred_Labels_SCINA <- list()\r\n  Total_Time_SCINA <- list()\r\n  \r\n  library(preprocessCore)\r\n  Data = t(as.matrix(Data))\r\n  Data=log(Data+1)\r\n  Data[]=normalize.quantiles(Data)\r\n  \r\n  start_time <- Sys.time()\r\n  results = SCINA(Data, Signature_Genes)\r\n  end_time <- Sys.time()\r\n  \r\n  True_Labels_SCINA <- Labels\r\n  Pred_Labels_SCINA <- results$cell_labels\r\n  Total_Time_SCINA <- as.numeric(difftime(end_time,start_time,units = 'secs'))\r\n  \r\n  setwd(OutputDir)\r\n  \r\n  write.csv(True_Labels_SCINA,'SCINA_True_Labels.csv',row.names = 
FALSE)\r\n  write.csv(Pred_Labels_SCINA,'SCINA_Pred_Labels.csv',row.names = FALSE)\r\n  write.csv(Total_Time_SCINA,'SCINA_Total_Time.csv',row.names = FALSE)\r\n}\r\n"
  },
  {
    "path": "Snakemake/Scripts/run_SVM.py",
    "content": "import os\r\nfrom sys import argv\r\nfrom pathlib import Path\r\n\r\nimport numpy as np\r\nimport pandas as pd\r\nimport time as tm\r\nfrom sklearn.svm import LinearSVC\r\nimport rpy2.robjects as robjects\r\n\r\n\r\ndef run_SVM(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = \"\", NumGenes = 0):\r\n    '''\r\n    run baseline classifier: SVM\r\n    Wrapper script to run an SVM classifier with a linear kernel on a benchmark dataset with 5-fold cross validation,\r\n    outputs lists of true and predicted cell labels as csv files, as well as computation time.\r\n\r\n    Parameters\r\n    ----------\r\n    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes\r\n    as row names and gene names as column names.\r\n    LabelsPath : Cell population annotations file path (.csv).\r\n    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.\r\n    OutputDir : Output directory defining the path of the exported file.\r\n    GeneOrderPath : Gene order file path (.csv) obtained from feature selection,\r\n    defining the genes order for each cross validation fold, default is NULL.\r\n    NumGenes : Number of genes used in case of feature selection (integer), default is 0.\r\n    '''\r\n\r\n    # read the Rdata file\r\n    robjects.r['load'](CV_RDataPath)\r\n\r\n    nfolds = np.array(robjects.r['n_folds'], dtype = 'int')\r\n    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')\r\n    col = np.array(robjects.r['col_Index'], dtype = 'int')\r\n    col = col - 1\r\n    test_ind = np.array(robjects.r['Test_Idx'])\r\n    train_ind = np.array(robjects.r['Train_Idx'])\r\n\r\n    # read the data\r\n    data = pd.read_csv(DataPath,index_col=0,sep=',')\r\n    labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col)\r\n\r\n    labels = labels.iloc[tokeep]\r\n    data = data.iloc[tokeep]\r\n\r\n    # read the feature file\r\n    if (NumGenes > 0):\r\n    
    features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')\r\n\r\n    # normalize data\r\n    data = np.log1p(data)\r\n\r\n    Classifier = LinearSVC()\r\n\r\n    tr_time=[]\r\n    ts_time=[]\r\n    truelab = []\r\n    pred = []\r\n\r\n    for i in range(np.squeeze(nfolds)):\r\n        test_ind_i = np.array(test_ind[i], dtype = 'int') - 1\r\n        train_ind_i = np.array(train_ind[i], dtype = 'int') - 1\r\n\r\n        train=data.iloc[train_ind_i]\r\n        test=data.iloc[test_ind_i]\r\n        y_train=labels.iloc[train_ind_i]\r\n        y_test=labels.iloc[test_ind_i]\r\n\r\n        if (NumGenes > 0):\r\n            feat_to_use = features.iloc[0:NumGenes,i]\r\n            train = train.iloc[:,feat_to_use]\r\n            test = test.iloc[:,feat_to_use]\r\n\r\n        start=tm.time()\r\n        Classifier.fit(train, y_train)\r\n        tr_time.append(tm.time()-start)\r\n\r\n        start=tm.time()\r\n        predicted = Classifier.predict(test)\r\n        ts_time.append(tm.time()-start)\r\n\r\n        truelab.extend(y_test.values)\r\n        pred.extend(predicted)\r\n\r\n    truelab = pd.DataFrame(truelab)\r\n    pred = pd.DataFrame(pred)\r\n\r\n    tr_time = pd.DataFrame(tr_time)\r\n    ts_time = pd.DataFrame(ts_time)\r\n\r\n    OutputDir = Path(OutputDir)\r\n    truelab.to_csv(str(OutputDir / Path(\"SVM_true.csv\")),\r\n                   index = False)\r\n    pred.to_csv(str(OutputDir / Path(\"SVM_pred.csv\")),\r\n                index = False)\r\n    tr_time.to_csv(str(OutputDir / Path(\"SVM_training_time.csv\")),\r\r\n                   index = False)\r\n    ts_time.to_csv(str(OutputDir / Path(\"SVM_test_time.csv\")),\r\n                   index = False)\r\n\r\nrun_SVM(argv[1], argv[2], argv[3], argv[4], argv[5], int(argv[6]))\r\n"
  },
  {
    "path": "Snakemake/Scripts/run_SVM_rejection.py",
    "content": "import os\r\nfrom sys import argv\r\nfrom pathlib import Path\r\n\r\nimport numpy as np\r\nimport pandas as pd\r\nimport time as tm\r\nfrom sklearn.svm import LinearSVC\r\nimport rpy2.robjects as robjects\r\nfrom sklearn.calibration import CalibratedClassifierCV\r\n\r\n\r\ndef run_SVM(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = \"\", NumGenes = 0, Threshold = 0.7):\r\n    '''\r\n    run baseline classifier: SVM\r\n    Wrapper script to run an SVM classifier with a linear kernel on a benchmark dataset with 5-fold cross validation,\r\n    outputs lists of true and predicted cell labels as csv files, as well as computation time.\r\n\r\n    Parameters\r\n    ----------\r\n    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes\r\n    as row names and gene names as column names.\r\n    LabelsPath : Cell population annotations file path (.csv).\r\n    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.\r\n    OutputDir : Output directory defining the path of the exported file.\r\n    GeneOrderPath : Gene order file path (.csv) obtained from feature selection,\r\n    defining the genes order for each cross validation fold, default is NULL.\r\n    NumGenes : Number of genes used in case of feature selection (integer), default is 0.\r\n    Threshold : Threshold used when rejecting the cells, default is 0.7.\r\n\r\n    '''\r\n\r\n    # read the Rdata file\r\n    robjects.r['load'](CV_RDataPath)\r\n\r\n    nfolds = np.array(robjects.r['n_folds'], dtype = 'int')\r\n    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')\r\n    col = np.array(robjects.r['col_Index'], dtype = 'int')\r\n    col = col - 1\r\n    test_ind = np.array(robjects.r['Test_Idx'])\r\n    train_ind = np.array(robjects.r['Train_Idx'])\r\n\r\n    # read the data\r\n    data = pd.read_csv(DataPath,index_col=0,sep=',')\r\n    labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', 
usecols = col)\r\n\r\n    labels = labels.iloc[tokeep]\r\n    data = data.iloc[tokeep]\r\n\r\n    # read the feature file\r\n    if (NumGenes > 0):\r\n        features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')\r\n\r\n    # normalize data\r\n    data = np.log1p(data)\r\n\r\n    Classifier = LinearSVC()\r\n    clf = CalibratedClassifierCV(Classifier)\r\n\r\n    tr_time=[]\r\n    ts_time=[]\r\n    truelab = []\r\n    pred = []\r\n\r\n    for i in range(np.squeeze(nfolds)):\r\n        test_ind_i = np.array(test_ind[i], dtype = 'int') - 1\r\n        train_ind_i = np.array(train_ind[i], dtype = 'int') - 1\r\n\r\n        train=data.iloc[train_ind_i]\r\n        test=data.iloc[test_ind_i]\r\n        y_train=labels.iloc[train_ind_i]\r\n        y_test=labels.iloc[test_ind_i]\r\n\r\n        if (NumGenes > 0):\r\n            feat_to_use = features.iloc[0:NumGenes,i]\r\n            train = train.iloc[:,feat_to_use]\r\n            test = test.iloc[:,feat_to_use]\r\n\r\n        start=tm.time()\r\n        clf.fit(train, y_train)\r\n        tr_time.append(tm.time()-start)\r\n\r\n        start=tm.time()\r\n        predicted = clf.predict(test)\r\n        prob = np.max(clf.predict_proba(test), axis = 1)\r\n        unlabeled = np.where(prob < Threshold)\r\n        predicted[unlabeled] = 'Unknown'\r\n        ts_time.append(tm.time()-start)\r\n\r\n        truelab.extend(y_test.values)\r\n        pred.extend(predicted)\r\n\r\n    truelab = pd.DataFrame(truelab)\r\n    pred = pd.DataFrame(pred)\r\n\r\n    tr_time = pd.DataFrame(tr_time)\r\n    ts_time = pd.DataFrame(ts_time)\r\n\r\n    OutputDir = Path(OutputDir)\r\n    truelab.to_csv(str(OutputDir / Path(\"SVM_rejection_true.csv\")),\r\n                   index = False)\r\n    pred.to_csv(str(OutputDir / Path(\"SVM_rejection_pred.csv\")),\r\n                index = False)\r\n    tr_time.to_csv(str(OutputDir / Path(\"SVM_rejection_training_time.csv\")),\r\n\r\n                   index = False)\r\n    
ts_time.to_csv(str(OutputDir / Path(\"SVM_rejection_test_time.csv\")),\r\n                   index = False)\r\n\r\nrun_SVM(argv[1], argv[2], argv[3], argv[4], argv[5], int(argv[6]))\r\n"
  },
  {
    "path": "Snakemake/Scripts/run_SingleR.R",
    "content": "args <- commandArgs(TRUE)\r\n\r\nrun_SingleR<-function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){\r\n  \"\r\n  run SingleR\r\n  Wrapper script to run SingleR on a benchmark dataset with 5-fold cross validation,\r\n  outputs lists of true and predicted cell labels as csv files, as well as computation time.\r\n\r\n  Parameters\r\n  ----------\r\n  DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes\r\n  as row names and gene names as column names.\r\n  LabelsPath : Cell population annotations file path (.csv).\r\n  CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.\r\n  OutputDir : Output directory defining the path of the exported file.\r\n  GeneOrderPath : Gene order file path (.csv) obtained from feature selection,\r\n  defining the genes order for each cross validation fold, default is NULL.\r\n  NumGenes : Number of genes used in case of feature selection (integer), default is NULL.\r\n  \"\r\n\r\n  Data <- read.csv(DataPath,row.names = 1)\r\n  Labels <- as.matrix(read.csv(LabelsPath))\r\n  load(CV_RDataPath)\r\n  Labels <- as.vector(Labels[,col_Index])\r\n  Data <- Data[Cells_to_Keep,]\r\n  Labels <- Labels[Cells_to_Keep]\r\n  if(!is.null(GeneOrderPath) & !is.null (NumGenes)){\r\n    GenesOrder = read.csv(GeneOrderPath)\r\n  }\r\n\r\n  #############################################################################\r\n  #                               SingleR                                     #\r\n  #############################################################################\r\n  library(SingleR)\r\n  library(Seurat)\r\n  True_Labels_SingleR <- list()\r\n  Pred_Labels_SingleR <- list()\r\n  Total_Time_SingleR <- list()\r\n  Data = t(as.matrix(Data))\r\n\r\n  for (i in c(1:n_folds)){\r\n    if(!is.null(GeneOrderPath) & !is.null (NumGenes)){\r\n      start_time <- Sys.time()\r\n      singler = SingleR(method = \"single\", 
Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Test_Idx[[i]]],\r\n                        Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Train_Idx[[i]]],\r\n                        Labels[Train_Idx[[i]]], numCores = 1)\r\n      end_time <- Sys.time()\r\n    }\r\n    else{\r\n      start_time <- Sys.time()\r\n      singler = SingleR(method = \"single\", Data[,Test_Idx[[i]]], Data[,Train_Idx[[i]]], Labels[Train_Idx[[i]]], numCores = 1)\r\n      end_time <- Sys.time()\r\n    }\r\n    Total_Time_SingleR[i] <- as.numeric(difftime(end_time,start_time,units = 'secs'))\r\n\r\n    True_Labels_SingleR[i] <- list(Labels[Test_Idx[[i]]])\r\n    Pred_Labels_SingleR[i] <- list(as.vector(singler$labels))\r\n  }\r\n  True_Labels_SingleR <- as.vector(unlist(True_Labels_SingleR))\r\n  Pred_Labels_SingleR <- as.vector(unlist(Pred_Labels_SingleR))\r\n  Total_Time_SingleR <- as.vector(unlist(Total_Time_SingleR))\r\n\r\n  write.csv(True_Labels_SingleR,paste0(OutputDir,'/SingleR_true.csv'),row.names = FALSE)\r\n  write.csv(Pred_Labels_SingleR,paste0(OutputDir,'/SingleR_pred.csv'),row.names = FALSE)\r\n  write.csv(Total_Time_SingleR,paste0(OutputDir,'/SingleR_total_time.csv'),row.names = FALSE)\r\n}\r\n\r\nif (args[6] == \"0\") {\r\n  run_SingleR(args[1], args[2], args[3], args[4])\r\n} else {\r\n  run_SingleR(args[1], args[2], args[3], args[4], args[5], as.numeric(args[6]))\r\n}\r\n"
  },
  {
    "path": "Snakemake/Scripts/run_kNN50.py",
    "content": "import os\r\nfrom sys import argv\r\nfrom pathlib import Path\r\n\r\nimport numpy as np\r\nimport pandas as pd\r\nimport time as tm\r\nfrom sklearn.neighbors import KNeighborsClassifier\r\nimport rpy2.robjects as robjects\r\n\r\n\r\ndef run_kNN50(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = \"\", NumGenes = 0):\r\n    '''\r\n    run baseline classifiers: kNN\r\n    Wrapper script to run kNN (with k = 50) classifier on a benchmark dataset with 5-fold cross validation,\r\n    outputs lists of true and predicted cell labels as csv files, as well as computation time.\r\n\r\n    Parameters\r\n    ----------\r\n    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes\r\n    as row names and gene names as column names.\r\n    LabelsPath : Cell population annotations file path (.csv).\r\n    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.\r\n    OutputDir : Output directory defining the path of the exported file.\r\n    GeneOrderPath : Gene order file path (.csv) obtained from feature selection,\r\n    defining the genes order for each cross validation fold, default is NULL.\r\n    NumGenes : Number of genes used in case of feature selection (integer), default is 0.\r\n    '''\r\n\r\n    # read the Rdata file\r\n    robjects.r['load'](CV_RDataPath)\r\n\r\n    nfolds = np.array(robjects.r['n_folds'], dtype = 'int')\r\n    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')\r\n    col = np.array(robjects.r['col_Index'], dtype = 'int')\r\n    col = col - 1\r\n    test_ind = np.array(robjects.r['Test_Idx'])\r\n    train_ind = np.array(robjects.r['Train_Idx'])\r\n\r\n    # read the data\r\n    data = pd.read_csv(DataPath,index_col=0,sep=',')\r\n    labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col)\r\n\r\n    labels = labels.iloc[tokeep]\r\n    data = data.iloc[tokeep]\r\n\r\n    # read the feature file\r\n    if (NumGenes > 
0):\r\n        features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')\r\n\r\n    # normalize data\r\n    data = np.log1p(data)\r\n\r\n    Classifier = KNeighborsClassifier(n_neighbors=50)\r\n\r\n    tr_time=[]\r\n    ts_time=[]\r\n    truelab = []\r\n    pred = []\r\n\r\n    for i in range(np.squeeze(nfolds)):\r\n        test_ind_i = np.array(test_ind[i], dtype = 'int') - 1\r\n        train_ind_i = np.array(train_ind[i], dtype = 'int') - 1\r\n\r\n        train=data.iloc[train_ind_i]\r\n        test=data.iloc[test_ind_i]\r\n        y_train=labels.iloc[train_ind_i]\r\n        y_test=labels.iloc[test_ind_i]\r\n\r\n        if (NumGenes > 0):\r\n            feat_to_use = features.iloc[0:NumGenes,i]\r\n            train = train.iloc[:,feat_to_use]\r\n            test = test.iloc[:,feat_to_use]\r\n\r\n        start=tm.time()\r\n        Classifier.fit(train, y_train)\r\n        tr_time.append(tm.time()-start)\r\n\r\n        start=tm.time()\r\n        predicted = Classifier.predict(test)\r\n        ts_time.append(tm.time()-start)\r\n\r\n        truelab.extend(y_test.values)\r\n        pred.extend(predicted)\r\n\r\n    truelab = pd.DataFrame(truelab)\r\n    pred = pd.DataFrame(pred)\r\n\r\n    tr_time = pd.DataFrame(tr_time)\r\n    ts_time = pd.DataFrame(ts_time)\r\n\r\n    OutputDir = Path(OutputDir)\r\n    truelab.to_csv(str(OutputDir / Path(\"kNN50_true.csv\")),\r\n                   index = False)\r\n    pred.to_csv(str(OutputDir / Path(\"kNN50_pred.csv\")),\r\n                index = False)\r\n    tr_time.to_csv(str(OutputDir / Path(\"kNN50_training_time.csv\")),\r\n                   index = False)\r\n    ts_time.to_csv(str(OutputDir / Path(\"kNN50_test_time.csv\")),\r\n                   index = False)\r\n\r\nrun_kNN50(argv[1], argv[2], argv[3], argv[4], argv[5], int(argv[6]))\r\n"
  },
  {
    "path": "Snakemake/Scripts/run_kNN9.py",
    "content": "import os\r\nfrom sys import argv\r\nfrom pathlib import Path\r\n\r\nimport numpy as np\r\nimport pandas as pd\r\nimport time as tm\r\nfrom sklearn.neighbors import KNeighborsClassifier\r\nimport rpy2.robjects as robjects\r\n\r\n\r\ndef run_kNN9(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = \"\", NumGenes = 0):\r\n    '''\r\n    run baseline classifiers: kNN\r\n    Wrapper script to run kNN (with k = 9) classifier on a benchmark dataset with 5-fold cross validation,\r\n    outputs lists of true and predicted cell labels as csv files, as well as computation time.\r\n\r\n    Parameters\r\n    ----------\r\n    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes\r\n    as row names and gene names as column names.\r\n    LabelsPath : Cell population annotations file path (.csv).\r\n    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.\r\n    OutputDir : Output directory defining the path of the exported file.\r\n    GeneOrderPath : Gene order file path (.csv) obtained from feature selection,\r\n    defining the genes order for each cross validation fold, default is NULL.\r\n    NumGenes : Number of genes used in case of feature selection (integer), default is 0.\r\n    '''\r\n\r\n    # read the Rdata file\r\n    robjects.r['load'](CV_RDataPath)\r\n\r\n    nfolds = np.array(robjects.r['n_folds'], dtype = 'int')\r\n    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')\r\n    col = np.array(robjects.r['col_Index'], dtype = 'int')\r\n    col = col - 1\r\n    test_ind = np.array(robjects.r['Test_Idx'])\r\n    train_ind = np.array(robjects.r['Train_Idx'])\r\n\r\n    # read the data\r\n    data = pd.read_csv(DataPath,index_col=0,sep=',')\r\n    labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col)\r\n\r\n    labels = labels.iloc[tokeep]\r\n    data = data.iloc[tokeep]\r\n\r\n    # read the feature file\r\n    if (NumGenes > 
0):\r\n        features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')\r\n\r\n    # normalize data\r\n    data = np.log1p(data)\r\n\r\n    Classifier = KNeighborsClassifier(n_neighbors=9)\r\n\r\n    tr_time=[]\r\n    ts_time=[]\r\n    truelab = []\r\n    pred = []\r\n\r\n    for i in range(np.squeeze(nfolds)):\r\n        test_ind_i = np.array(test_ind[i], dtype = 'int') - 1\r\n        train_ind_i = np.array(train_ind[i], dtype = 'int') - 1\r\n\r\n        train=data.iloc[train_ind_i]\r\n        test=data.iloc[test_ind_i]\r\n        y_train=labels.iloc[train_ind_i]\r\n        y_test=labels.iloc[test_ind_i]\r\n\r\n        if (NumGenes > 0):\r\n            feat_to_use = features.iloc[0:NumGenes,i]\r\n            train = train.iloc[:,feat_to_use]\r\n            test = test.iloc[:,feat_to_use]\r\n\r\n        start=tm.time()\r\n        Classifier.fit(train, y_train)\r\n        tr_time.append(tm.time()-start)\r\n\r\n        start=tm.time()\r\n        predicted = Classifier.predict(test)\r\n        ts_time.append(tm.time()-start)\r\n\r\n        truelab.extend(y_test.values)\r\n        pred.extend(predicted)\r\n\r\n    truelab = pd.DataFrame(truelab)\r\n    pred = pd.DataFrame(pred)\r\n\r\n    tr_time = pd.DataFrame(tr_time)\r\n    ts_time = pd.DataFrame(ts_time)\r\n\r\n    OutputDir = Path(OutputDir)\r\n    truelab.to_csv(str(OutputDir / Path(\"kNN9_true.csv\")),\r\n                   index = False)\r\n    pred.to_csv(str(OutputDir / Path(\"kNN9_pred.csv\")),\r\n                index = False)\r\n    tr_time.to_csv(str(OutputDir / Path(\"kNN9_training_time.csv\")),\r\n                   index = False)\r\n    ts_time.to_csv(str(OutputDir / Path(\"kNN9_test_time.csv\")),\r\n                   index = False)\r\n\r\nrun_kNN9(argv[1], argv[2], argv[3], argv[4], argv[5], int(argv[6]))\r\n"
  },
  {
    "path": "Snakemake/Scripts/run_moana.py",
    "content": "import os\r\nimport pandas as pd\r\nimport numpy as np\r\nfrom moana.core import ExpMatrix\r\nfrom moana.classify import CellTypeClassifier\r\nimport time as tm\r\nimport rpy2.robjects as robjects\r\n\r\ndef run_moana(DataPath, LabelsPath, ClassifierPath, OutputDir, GeneOrderPath = \"\", NumGenes = 0):\r\n    '''\r\n    run moana\r\n    Wrapper script to run moana on a benchmark dataset with a pretrained classifier,\r\n    outputs lists of true and predicted cell labels as csv files, as well as computation time.  \r\n  \r\n    Parameters\r\n    ----------\r\n    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes \r\n    as row names and gene names as column names.\r\n    LabelsPath : Cell population annotations file path (.csv).\r\n    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.\r\n    ClassifierPath : Data file path to the pretrained classifier.\r\n    OutputDir : Output directory defining the path of the exported file.\r\n    GeneOrderPath : Gene order file path (.csv) obtained from feature selection, \r\n    defining the genes order for each cross validation fold, default is NULL.\r\n    NumGenes : Number of genes used in case of feature selection (integer), default is 0.\r\n    '''\r\n    \r\n#    # read the Rdata file\r\n#    robjects.r['load'](CV_RDataPath)\r\n#\r\n#    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')\r\n#    col = np.array(robjects.r['col_Index'], dtype = 'int')\r\n#    col = col - 1\r\n    \r\n    matrix = ExpMatrix.read_tsv(DataPath, sep = ',')    \r\n#    matrix = matrix.iloc[tokeep] \r\n    \r\n    truelab = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',')\r\n#    truelab = truelab.iloc[tokeep]\r\n    \r\n    ct_old = ['CD19+ B','CD14+ Monocyte','CD4+/CD45RA+/CD25- Naive T','CD4+/CD45RO+ Memory','CD8+/CD45RA+ Naive Cytotoxic','Dendritic', 'CD56+ NK']\r\n    ct_new = ['B cells','CD14+ monocytes','Naive CD4+ T 
cells','Memory CD4+ T cells','Naive CD8+ T cells','Dendritic cells','NK cells']\r\n    \r\n    tokeep2 = np.isin(truelab,ct_old)\r\n    truelab = truelab[tokeep2]\r\n    print(len(truelab))\r\n    matrix = matrix.iloc[np.squeeze(tokeep2)]\r\n    \r\n    for i in range(len(ct_old)):\r\n        truelab.iloc[truelab == ct_old[i]] = ct_new[i]\r\n        \r\n    # read the feature file\r\n    if (NumGenes > 0):\r\n        features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')\r\n        feat_to_use = features.iloc[0:NumGenes,0]\r\n        matrix = matrix.iloc[:,feat_to_use]\r\n\r\n    data = ExpMatrix(X = np.transpose(matrix.X), genes = matrix.cells, cells = matrix.genes)\r\n    data.genes.name = 'Genes'\r\n    data.cells.name = 'Cells'\r\n    data.index.name = 'Genes'\r\n    data.columns.name = 'Cells'\r\n    \r\n    clf = CellTypeClassifier.read_pickle(ClassifierPath)\r\n    \r\n    start = tm.time()\r\n    predictions = clf.predict(data)\r\n    runtime = tm.time() - start\r\n    \r\n    np.asarray(predictions)\r\n    \r\n    pred = pd.DataFrame(predictions)\r\n        \r\n    os.chdir(OutputDir)\r\n            \r\n    if (NumGenes == 0):  \r\n        truelab.to_csv(\"moana_True_Labels.csv\", index = False)\r\n        pred.to_csv(\"moana_Pred_Labels.csv\", index = False)\r\n        with open(\"moana_Total_Time.csv\", 'w') as f:\r\n            f.write(\"%f\\n\" % runtime)\r\n    else:\r\n        truelab.to_csv(\"moana_\" + str(NumGenes) + \"_True_Labels.csv\", index = False)\r\n        pred.to_csv(\"moana_\" + str(NumGenes) + \"_Pred_Labels.csv\", index = False)\r\n        with open(\"moana_\" + str(NumGenes) + \"_Total_Time.csv\", 'w') as f:\r\n            f.write(\"%f\\n\" % runtime)\r\n\r\n\r\n        \r\n    \r\n    \r\n    \r\n    \r\n    \r\n    \r\n"
  },
  {
    "path": "Snakemake/Scripts/run_scID.R",
    "content": "args <- commandArgs(TRUE)\n\nrun_scID<-function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){\n  \"\n  run scID\n  Wrapper script to run scID on a benchmark dataset with 5-fold cross validation,\n  outputs lists of true and predicted cell labels as csv files, as well as computation time.\n\n  Parameters\n  ----------\n  DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes\n  as row names and gene names as column names.\n  LabelsPath : Cell population annotations file path (.csv).\n  CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.\n  OutputDir : Output directory defining the path of the exported file.\n  GeneOrderPath : Gene order file path (.csv) obtained from feature selection,\n  defining the genes order for each cross validation fold, default is NULL.\n  NumGenes : Number of genes used in case of feature selection (integer), default is NULL.\n  \"\n\n  Data <- read.csv(DataPath,row.names = 1)\n  Labels <- as.matrix(read.csv(LabelsPath))\n  load(CV_RDataPath)\n  Labels <- as.vector(Labels[,col_Index])\n  Data <- Data[Cells_to_Keep,]\n  Labels <- Labels[Cells_to_Keep]\n  if(!is.null(GeneOrderPath) & !is.null (NumGenes)){\n    GenesOrder = read.csv(GeneOrderPath)\n  }\n\n  #############################################################################\n  #                                 scID                                      #\n  #############################################################################\n  library(scID)\n  library(Seurat)\n  True_Labels_scID <- list()\n  Pred_Labels_scID <- list()\n  Total_Time_scID <- list()\n  Data = t(as.matrix(Data))\n\n  for (i in c(1:n_folds)){\n    if(!is.null(GeneOrderPath) & !is.null (NumGenes)){\n      Train_Labels <- list(Labels[Train_Idx[[i]]])\n      names(Train_Labels[[1]]) <- colnames(Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Train_Idx[[i]]])\n      start_time <- Sys.time()\n     
 scID_output <- scid_multiclass(Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Test_Idx[[i]]],\n                                     Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Train_Idx[[i]]],\n                                     Train_Labels[[1]])\n      end_time <- Sys.time()\n    }\n    else{\n      Train_Labels <- list(Labels[Train_Idx[[i]]])\n      names(Train_Labels[[1]]) <- colnames(Data[,Train_Idx[[i]]])\n      start_time <- Sys.time()\n      scID_output <- scid_multiclass(Data[,Test_Idx[[i]]], Data[,Train_Idx[[i]]], Train_Labels[[1]])\n      end_time <- Sys.time()\n    }\n    Total_Time_scID[i] <- as.numeric(difftime(end_time,start_time,units = 'secs'))\n\n    True_Labels_scID[i] <- list(Labels[Test_Idx[[i]]])\n    Pred_Labels_scID[i] <- list(as.vector(scID_output$labels))\n  }\n  True_Labels_scID <- as.vector(unlist(True_Labels_scID))\n  Pred_Labels_scID <- as.vector(unlist(Pred_Labels_scID))\n  Total_Time_scID <- as.vector(unlist(Total_Time_scID))\n\n  write.csv(Pred_Labels_scID, paste0(OutputDir,'/scID_pred.csv'),row.names = FALSE)\n  write.csv(True_Labels_scID, paste0(OutputDir,'/scID_true.csv'),row.names = FALSE)\n  write.csv(Total_Time_scID,paste0(OutputDir,'/scID_total_time.csv'),row.names = FALSE)\n\n}\n\nif (args[6] == \"0\") {\n  run_scID(args[1], args[2], args[3], args[4])\n} else {\n  run_scID(args[1], args[2], args[3], args[4], args[5], as.numeric(args[6]))\n}\n"
  },
  {
    "path": "Snakemake/Scripts/run_scPred.R",
    "content": "run_scPred<-function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){\r\n  \"\r\n  run scPred\r\n  Wrapper script to run scPred on a benchmark dataset with 5-fold cross validation,\r\n  outputs lists of true and predicted cell labels as csv files, as well as computation time.\r\n  \r\n  Parameters\r\n  ----------\r\n  DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes \r\n  as row names and gene names as column names.\r\n  LabelsPath : Cell population annotations file path (.csv).\r\n  CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.\r\n  OutputDir : Output directory defining the path of the exported file.\r\n  GeneOrderPath : Gene order file path (.csv) obtained from feature selection, \r\n  defining the genes order for each cross validation fold, default is NULL.\r\n  NumGenes : Number of genes used in case of feature selection (integer), default is NULL.\r\n  \"\r\n  \r\n  Data <- read.csv(DataPath,row.names = 1)\r\n  Labels <- as.matrix(read.csv(LabelsPath))\r\n  load(CV_RDataPath)\r\n  Labels <- as.vector(Labels[,col_Index])\r\n  Data <- Data[Cells_to_Keep,]\r\n  Labels <- Labels[Cells_to_Keep]\r\n  if(!is.null(GeneOrderPath) & !is.null (NumGenes)){\r\n    GenesOrder = read.csv(GeneOrderPath)\r\n  }\r\n  \r\n  #############################################################################\r\n  #                                scPred                                     #\r\n  #############################################################################\r\n  library(scPred)\r\n  library(tidyverse)\r\n  library(SingleCellExperiment)\r\n  True_Labels_scPred <- list()\r\n  Pred_Labels_scPred <- list()\r\n  Training_Time_scPred <- list()\r\n  Testing_Time_scPred <- list()\r\n  Data = t(as.matrix(Data))\r\n  \r\n  for (i in c(1:n_folds)){\r\n    if(!is.null(GeneOrderPath) & !is.null (NumGenes)){\r\n      sce <- 
SingleCellExperiment(list(normcounts = Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Train_Idx[[i]]]), \r\n                                  colData = data.frame(cell_type1 = Labels[Train_Idx[[i]]]))\r\n      sce_counts <- normcounts(sce)\r\n      sce_cpm <- apply(sce_counts, 2, function(x) (x/sum(x))*1000000)\r\n      sce_metadata <- as.data.frame(colData(sce))\r\n      \r\n      sce_test <- SingleCellExperiment(list(normcounts = Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Test_Idx[[i]]]), \r\n                                       colData = data.frame(cell_type1 = Labels[Test_Idx[[i]]]))\r\n      sce_counts_test <- normcounts(sce_test)\r\n      sce_cpm_test <- apply(sce_counts_test, 2, function(x) (x/sum(x))*1000000)\r\n      sce_metadata_test <- as.data.frame(colData(sce_test))\r\n    }\r\n    else{\r\n      sce <- SingleCellExperiment(list(normcounts = Data[,Train_Idx[[i]]]), \r\n                                  colData = data.frame(cell_type1 = Labels[Train_Idx[[i]]]))\r\n      sce_counts <- normcounts(sce)\r\n      sce_cpm <- apply(sce_counts, 2, function(x) (x/sum(x))*1000000)\r\n      sce_metadata <- as.data.frame(colData(sce))\r\n      \r\n      sce_test <- SingleCellExperiment(list(normcounts = Data[,Test_Idx[[i]]]), \r\n                                       colData = data.frame(cell_type1 = Labels[Test_Idx[[i]]]))\r\n      sce_counts_test <- normcounts(sce_test)\r\n      sce_cpm_test <- apply(sce_counts_test, 2, function(x) (x/sum(x))*1000000)\r\n      sce_metadata_test <- as.data.frame(colData(sce_test))\r\n    }\r\n    \r\n    \r\n    # scPred Training    \r\n    start_time <- Sys.time()\r\n    set.seed(1234)\r\n    scp <- eigenDecompose(sce_cpm)\r\n    scPred::metadata(scp) <- sce_metadata\r\n    scp <- getFeatureSpace(scp, pVar = 'cell_type1')\r\n    # plotEigen(scp, group = 'cell_type1')\r\n    scp <- trainModel(scp)\r\n    # plotTrainProbs(scp)\r\n    end_time <- Sys.time()\r\n    Training_Time_scPred[i] <- 
as.numeric(difftime(end_time,start_time,units = 'secs'))\r\n    \r\n    # scPred Prediction\r\n    start_time <- Sys.time()\r\n    scp <- scPredict(scp,newData = sce_cpm_test)\r\n    end_time <- Sys.time()\r\n    Testing_Time_scPred[i] <- as.numeric(difftime(end_time,start_time,units = 'secs'))\r\n    \r\n    True_Labels_scPred[i] <- list(Labels[Test_Idx[[i]]])\r\n    Pred_Labels_scPred[i] <- list(getPredictions(scp)$predClass)\r\n  }\r\n  True_Labels_scPred <- as.vector(unlist(True_Labels_scPred))\r\n  Pred_Labels_scPred <- as.vector(unlist(Pred_Labels_scPred))\r\n  Training_Time_scPred <- as.vector(unlist(Training_Time_scPred))\r\n  Testing_Time_scPred <- as.vector(unlist(Testing_Time_scPred))\r\n  \r\n  setwd(OutputDir)\r\n  \r\n  if(!is.null(GeneOrderPath) & !is.null (NumGenes)){\r\n    write.csv(True_Labels_scPred,paste('scPred_',NumGenes,'_True_Labels.csv', sep = ''),row.names = FALSE)\r\n    write.csv(Pred_Labels_scPred,paste('scPred_',NumGenes,'_Pred_Labels.csv', sep = ''),row.names = FALSE)\r\n    write.csv(Training_Time_scPred,paste('scPred_',NumGenes,'_Training_Time.csv', sep = ''),row.names = FALSE)\r\n    write.csv(Testing_Time_scPred,paste('scPred_',NumGenes,'_Testing_Time.csv', sep = ''),row.names = FALSE)\r\n  }\r\n  else{\r\n    write.csv(True_Labels_scPred,'scPred_True_Labels.csv',row.names = FALSE)\r\n    write.csv(Pred_Labels_scPred,'scPred_Pred_Labels.csv',row.names = FALSE)\r\n    write.csv(Training_Time_scPred,'scPred_Training_Time.csv',row.names = FALSE)\r\n    write.csv(Testing_Time_scPred,'scPred_Testing_Time.csv',row.names = FALSE)\r\n  }\r\n}\r\n"
  },
  {
    "path": "Snakemake/Scripts/run_scVI.py",
    "content": "from scvi.dataset import CsvDataset\r\nimport os\r\nfrom sys import argv\r\nfrom pathlib import Path\r\nfrom scvi.dataset import CsvDataset\r\nimport numpy as np\r\nimport pandas as pd\r\nfrom scvi.models import SCANVI\r\nfrom scvi.inference import SemiSupervisedTrainer\r\nimport time as tm\r\nimport rpy2.robjects as robjects\r\n\r\ndef run_scVI(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = \"\", NumGenes = 0):\r\n    '''\r\n    run scVI\r\n    Wrapper script to run scVI on a benchmark dataset with 5-fold cross validation,\r\n    outputs lists of true and predicted cell labels as csv files, as well as computation time.\r\n  \r\n    Parameters\r\n    ----------\r\n    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes \r\n    as row names and gene names as column names.\r\n    LabelsPath : Cell population annotations file path (.csv).\r\n    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.\r\n    OutputDir : Output directory defining the path of the exported file.\r\n    GeneOrderPath : Gene order file path (.csv) obtained from feature selection, \r\n    defining the genes order for each cross validation fold, default is NULL.\r\n    NumGenes : Number of genes used in case of feature selection (integer), default is 0.\r\n    '''\r\n    \r\n    # read the Rdata file\r\n    robjects.r['load'](CV_RDataPath)\r\n\r\n    nfolds = np.array(robjects.r['n_folds'], dtype = 'int')\r\n    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')\r\n    col = np.array(robjects.r['col_Index'], dtype = 'int')\r\n    col = col - 1 \r\n    test_ind = np.array(robjects.r['Test_Idx'])\r\n    train_ind = np.array(robjects.r['Train_Idx'])\r\n\r\n    # read the data\r\n    data = pd.read_csv(DataPath,index_col=0,sep=',')\r\n    labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col)\r\n\r\n    labels = labels.iloc[tokeep]\r\n    data = 
data.iloc[tokeep] \r\n    \r\n    # read the feature file\r\n    if (NumGenes > 0):\r\n        features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')\r\n    \r\n    if (NumGenes == 0):\r\n        #save labels as csv file with header and index column\r\n        labels.to_csv('Labels_scvi.csv')\r\n        data.to_csv('Data_scvi.csv')    \r\n        \r\n        train = CsvDataset('Data_scvi.csv', save_path = \"\", sep = \",\", labels_file = \"Labels_scvi.csv\", gene_by_cell = False)\r\n        \r\n        ## this semisupervised trainer automatically uses a part of the input data for training and a part for testing\r\n        scanvi = SCANVI(train.nb_genes, train.n_batches, train.n_labels)\r\n        trainer_scanvi = SemiSupervisedTrainer(scanvi, train, frequency=5)\r\n    \r\n    n_epochs = 200\r\n    \r\n    truelab = []\r\n    pred = []\r\n    tr_time = []\r\n    ts_time = []\r\n    \r\n    for i in range(np.squeeze(nfolds)):\r\n        test_ind_i = np.array(test_ind[i], dtype = 'int') - 1\r\n        train_ind_i = np.array(train_ind[i], dtype = 'int') - 1\r\n        \r\n        if (NumGenes > 0):\r\n            feat_to_use = features.iloc[0:NumGenes,i]\r\n            data2 = data.iloc[:,feat_to_use]\r\n\r\n            labels.to_csv(OutputDir +'/Labels_scvi.csv')\r\n            data2.to_csv(OutputDir +'/Data_scvi.csv')\r\n\r\n            train = CsvDataset(OutputDir +'/Data_scvi.csv', save_path = \"\", sep = \",\", labels_file = OutputDir +\"/Labels_scvi.csv\", gene_by_cell = False, new_n_genes = False)\r\n\r\n            ## this semisupervised trainer automatically uses a part of the input data for training and a part for testing\r\n            scanvi = SCANVI(train.nb_genes, train.n_batches, train.n_labels)\r\n            trainer_scanvi = SemiSupervisedTrainer(scanvi, train, frequency=5)\r\n\r\n        trainer_scanvi.labelled_set = trainer_scanvi.create_posterior(indices=(train_ind_i).ravel(), shuffle = False)\r\n        
trainer_scanvi.labelled_set.to_monitor = ['ll','accuracy']\r\n        trainer_scanvi.unlabelled_set = trainer_scanvi.create_posterior(indices=(test_ind_i).ravel(), shuffle = False)\r\n        trainer_scanvi.unlabelled_set.to_monitor = ['ll','accuracy']\r\n    \r\n        start = tm.time()\r\n        trainer_scanvi.train(n_epochs)\r\n        tr_time.append(tm.time()-start)\r\n    \r\n        ## labels of test set are in y_pred\r\n        ## labels are returned in numbers, should be mapped back to the real labels\r\n        ## indices are permutated\r\n        start = tm.time()\r\n        y_true, y_pred = trainer_scanvi.unlabelled_set.compute_predictions()\r\n        ts_time.append(tm.time()-start)\r\n        \r\n        truelab.extend(y_true)\r\n        pred.extend(y_pred)\r\n    \r\n    #write results\r\n\r\n    truelab = pd.DataFrame(truelab)\r\n    pred = pd.DataFrame(pred)\r\n    \r\n    tr_time = pd.DataFrame(tr_time)\r\n    ts_time = pd.DataFrame(ts_time)\r\n\r\n    truelab.to_csv(str(Path(OutputDir + \"/scVI_true.csv\")), index=False)\r\n    pred.to_csv(str(Path(OutputDir + \"/scVI_pred.csv\")), index=False)\r\n    tr_time.to_csv(str(Path(OutputDir + \"/scVI_training_time.csv\")), index=False)\r\n    ts_time.to_csv(str(Path(OutputDir + \"/scVI_test_time.csv\")), index=False)\r\n\r\nrun_scVI(argv[1], argv[2], argv[3], argv[4], argv[5], int(argv[6]))\r\n"
  },
  {
    "path": "Snakemake/Scripts/run_scmap.R",
    "content": "run_scmap <- function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){\r\n  \"\r\n  run scmap\r\n  Wrapper script to run scmap on a benchmark dataset with 5-fold cross validation,\r\n  outputs lists of true and predicted cell labels as csv files, as well as computation time.\r\n  \r\n  Parameters\r\n  ----------\r\n  DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes \r\n  as row names and gene names as column names.\r\n  LabelsPath : Cell population annotations file path (.csv).\r\n  CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.\r\n  OutputDir : Output directory defining the path of the exported file.\r\n  GeneOrderPath : Gene order file path (.csv) obtained from feature selection, \r\n  defining the genes order for each cross validation fold, default is NULL.\r\n  NumGenes : Number of genes used in case of feature selection (integer), default is NULL.\r\n  \"\r\n  \r\n  Data <- read.csv(DataPath,row.names = 1)\r\n  Labels <- as.matrix(read.csv(LabelsPath))\r\n  load(CV_RDataPath)\r\n  Labels <- as.vector(Labels[,col_Index])\r\n  Data <- Data[Cells_to_Keep,]\r\n  Labels <- Labels[Cells_to_Keep]\r\n  if(!is.null(GeneOrderPath) & !is.null (NumGenes)){\r\n    GenesOrder = read.csv(GeneOrderPath)\r\n  }\r\n  \r\n  #############################################################################\r\n  #                                 scmap                                     #\r\n  #############################################################################\r\n  library(scmap)\r\n  library(SingleCellExperiment)\r\n  True_Labels_scmapcluster <- list()\r\n  Pred_Labels_scmapcluster <- list()\r\n  True_Labels_scmapcell <- list()\r\n  Pred_Labels_scmapcell <- list()\r\n  Training_Time_scmapcluster <- list()\r\n  Testing_Time_scmapcluster <- list()\r\n  Training_Time_scmapcell <- list()\r\n  Testing_Time_scmapcell <- list()\r\n  Data = 
t(as.matrix(Data))\r\n  \r\n  for (i in c(1:n_folds)){\r\n    if(!is.null(GeneOrderPath) & !is.null (NumGenes)){\r\n      sce <- SingleCellExperiment(list(normcounts = Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Train_Idx[[i]]]), \r\n                                  colData = data.frame(cell_type1 = Labels[Train_Idx[[i]]]))\r\n      logcounts(sce) <- log2(normcounts(sce) + 1)\r\n      # use gene names as feature symbols\r\n      rowData(sce)$feature_symbol <- rownames(sce)\r\n      sce <- selectFeatures(sce, n_features = NumGenes, suppress_plot = TRUE)\r\n      \r\n      sce_test <- SingleCellExperiment(list(normcounts = Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Test_Idx[[i]]]), \r\n                                       colData = data.frame(cell_type1 = Labels[Test_Idx[[i]]]))\r\n      logcounts(sce_test) <- log2(normcounts(sce_test) + 1)\r\n      rowData(sce_test)$feature_symbol <- rownames(sce_test)\r\n      sce_test@rowRanges@elementMetadata@listData = sce@rowRanges@elementMetadata@listData\r\n    }\r\n    else{\r\n      sce <- SingleCellExperiment(list(normcounts = Data[,Train_Idx[[i]]]), \r\n                                  colData = data.frame(cell_type1 = Labels[Train_Idx[[i]]]))\r\n      logcounts(sce) <- log2(normcounts(sce) + 1)\r\n      # use gene names as feature symbols\r\n      rowData(sce)$feature_symbol <- rownames(sce)\r\n      sce <- selectFeatures(sce, suppress_plot = TRUE)\r\n      \r\n      sce_test <- SingleCellExperiment(list(normcounts = Data[,Test_Idx[[i]]]), \r\n                                       colData = data.frame(cell_type1 = Labels[Test_Idx[[i]]]))\r\n      logcounts(sce_test) <- log2(normcounts(sce_test) + 1)\r\n      rowData(sce_test)$feature_symbol <- rownames(sce_test)\r\n      sce_test@rowRanges@elementMetadata@listData = sce@rowRanges@elementMetadata@listData\r\n    }\r\n    \r\n    # scmap-cluster\r\n    start_time <- Sys.time()\r\n    sce <- indexCluster(sce)\r\n    end_time <- Sys.time()\r\n    
Training_Time_scmapcluster[i] <- as.numeric(difftime(end_time,start_time,units = 'secs'))\r\n    \r\n    start_time <- Sys.time()\r\n    scmapCluster_results <- scmapCluster(projection = sce_test,index_list = list(metadata(sce)$scmap_cluster_index))\r\n    end_time <- Sys.time()\r\n    Testing_Time_scmapcluster[i] <- as.numeric(difftime(end_time,start_time,units = 'secs'))\r\n    \r\n    True_Labels_scmapcluster[i] <- list(Labels[Test_Idx[[i]]])\r\n    Pred_Labels_scmapcluster[i] <- list(scmapCluster_results$combined_labs)\r\n    \r\n    # scmap-cell\r\n    start_time <- Sys.time()\r\n    set.seed(1)\r\n    sce <- indexCell(sce)\r\n    end_time <- Sys.time()\r\n    Training_Time_scmapcell[i] <- as.numeric(difftime(end_time,start_time,units = 'secs'))\r\n    \r\n    start_time <- Sys.time()\r\n    scmapCell_results <- scmapCell(sce_test,list(metadata(sce)$scmap_cell_index))\r\n    scmapCell_clusters <- scmapCell2Cluster(scmapCell_results,list(as.character(colData(sce)$cell_type1)))\r\n    end_time <- Sys.time()\r\n    Testing_Time_scmapcell[i] <- as.numeric(difftime(end_time,start_time,units = 'secs'))\r\n    \r\n    True_Labels_scmapcell[i] <- list(Labels[Test_Idx[[i]]])\r\n    Pred_Labels_scmapcell[i] <- list(scmapCell_clusters$combined_labs)\r\n  }\r\n  \r\n  True_Labels_scmapcluster <- as.vector(unlist(True_Labels_scmapcluster))\r\n  Pred_Labels_scmapcluster <- as.vector(unlist(Pred_Labels_scmapcluster))\r\n  True_Labels_scmapcell <- as.vector(unlist(True_Labels_scmapcell))\r\n  Pred_Labels_scmapcell <- as.vector(unlist(Pred_Labels_scmapcell))\r\n  Training_Time_scmapcluster <- as.vector(unlist(Training_Time_scmapcluster))\r\n  Testing_Time_scmapcluster <- as.vector(unlist(Testing_Time_scmapcluster))\r\n  Training_Time_scmapcell <- as.vector(unlist(Training_Time_scmapcell))\r\n  Testing_Time_scmapcell <- as.vector(unlist(Testing_Time_scmapcell))\r\n  \r\n  setwd(OutputDir)\r\n  \r\n  if (!is.null(GeneOrderPath) & !is.null (NumGenes)){\r\n    
write.csv(True_Labels_scmapcluster,paste('scmapcluster_',NumGenes,'_True_Labels.csv', sep = ''),row.names = FALSE)\r\n    write.csv(Pred_Labels_scmapcluster,paste('scmapcluster_',NumGenes,'_Pred_Labels.csv', sep = ''),row.names = FALSE)\r\n    write.csv(True_Labels_scmapcell,paste('scmapcell_',NumGenes,'_True_Labels.csv', sep = ''),row.names = FALSE)\r\n    write.csv(Pred_Labels_scmapcell,paste('scmapcell_',NumGenes,'_Pred_Labels.csv', sep = ''),row.names = FALSE)\r\n    write.csv(Training_Time_scmapcluster,paste('scmapcluster_',NumGenes,'_Training_Time.csv', sep = ''),row.names = FALSE)\r\n    write.csv(Testing_Time_scmapcluster,paste('scmapcluster_',NumGenes,'_Testing_Time.csv', sep = ''),row.names = FALSE)\r\n    write.csv(Training_Time_scmapcell,paste('scmapcell_',NumGenes,'_Training_Time.csv', sep = ''),row.names = FALSE)\r\n    write.csv(Testing_Time_scmapcell,paste('scmapcell_',NumGenes,'_Testing_Time.csv', sep = ''),row.names = FALSE)\r\n  }\r\n  else{\r\n    write.csv(True_Labels_scmapcluster,'scmapcluster_True_Labels.csv',row.names = FALSE)\r\n    write.csv(Pred_Labels_scmapcluster,'scmapcluster_Pred_Labels.csv',row.names = FALSE)\r\n    write.csv(True_Labels_scmapcell,'scmapcell_True_Labels.csv',row.names = FALSE)\r\n    write.csv(Pred_Labels_scmapcell,'scmapcell_Pred_Labels.csv',row.names = FALSE)\r\n    write.csv(Training_Time_scmapcluster,'scmapcluster_Training_Time.csv',row.names = FALSE)\r\n    write.csv(Testing_Time_scmapcluster,'scmapcluster_Testing_Time.csv',row.names = FALSE)\r\n    write.csv(Training_Time_scmapcell,'scmapcell_Training_Time.csv',row.names = FALSE)\r\n    write.csv(Testing_Time_scmapcell,'scmapcell_Testing_Time.csv',row.names = FALSE)\r\n  }\r\n}\r\n"
  },
  {
    "path": "Snakemake/Scripts/run_scmapcell.R",
    "content": "args <- commandArgs(TRUE)\n\nrun_scmapcell <- function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){\n  \"\n  run scmapcell\n  Wrapper script to run scmap on a benchmark dataset with 5-fold cross validation,\n  outputs lists of true and predicted cell labels as csv files, as well as computation time.\n  \n  Parameters\n  ----------\n  DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes \n  as row names and gene names as column names.\n  LabelsPath : Cell population annotations file path (.csv).\n  CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.\n  OutputDir : Output directory defining the path of the exported file.\n  GeneOrderPath : Gene order file path (.csv) obtained from feature selection, \n  defining the genes order for each cross validation fold, default is NULL.\n  NumGenes : Number of genes used in case of feature selection (integer), default is NULL.\n  \"\n  \n  Data <- read.csv(DataPath,row.names = 1)\n  Labels <- as.matrix(read.csv(LabelsPath))\n  load(CV_RDataPath)\n  Labels <- as.vector(Labels[,col_Index])\n  Data <- Data[Cells_to_Keep,]\n  Labels <- Labels[Cells_to_Keep]\n  if(!is.null(GeneOrderPath) & !is.null (NumGenes)){\n    GenesOrder = read.csv(GeneOrderPath)\n  }\n  \n  #############################################################################\n  #                                 scmap                                     #\n  #############################################################################\n  library(scmap)\n  library(SingleCellExperiment)\n  True_Labels_scmapcell <- list()\n  Pred_Labels_scmapcell <- list()\n  Training_Time_scmapcell <- list()\n  Testing_Time_scmapcell <- list()\n  Data = t(as.matrix(Data))\n  \n  for (i in c(1:n_folds)){\n    if(!is.null(GeneOrderPath) & !is.null (NumGenes)){\n      sce <- SingleCellExperiment(list(normcounts = 
Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Train_Idx[[i]]]), \n                                  colData = data.frame(cell_type1 = Labels[Train_Idx[[i]]]))\n      logcounts(sce) <- log2(normcounts(sce) + 1)\n      # use gene names as feature symbols\n      rowData(sce)$feature_symbol <- rownames(sce)\n      sce <- selectFeatures(sce, n_features = NumGenes, suppress_plot = TRUE)\n      \n      sce_test <- SingleCellExperiment(list(normcounts = Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Test_Idx[[i]]]), \n                                       colData = data.frame(cell_type1 = Labels[Test_Idx[[i]]]))\n      logcounts(sce_test) <- log2(normcounts(sce_test) + 1)\n      rowData(sce_test)$feature_symbol <- rownames(sce_test)\n      sce_test@rowRanges@elementMetadata@listData = sce@rowRanges@elementMetadata@listData\n    }\n    else{\n      sce <- SingleCellExperiment(list(normcounts = Data[,Train_Idx[[i]]]), \n                                  colData = data.frame(cell_type1 = Labels[Train_Idx[[i]]]))\n      logcounts(sce) <- log2(normcounts(sce) + 1)\n      # use gene names as feature symbols\n      rowData(sce)$feature_symbol <- rownames(sce)\n      sce <- selectFeatures(sce, suppress_plot = TRUE)\n      \n      sce_test <- SingleCellExperiment(list(normcounts = Data[,Test_Idx[[i]]]), \n                                       colData = data.frame(cell_type1 = Labels[Test_Idx[[i]]]))\n      logcounts(sce_test) <- log2(normcounts(sce_test) + 1)\n      rowData(sce_test)$feature_symbol <- rownames(sce_test)\n      sce_test@rowRanges@elementMetadata@listData = sce@rowRanges@elementMetadata@listData\n    }\n    \n    # scmap-cell\n    start_time <- Sys.time()\n    set.seed(1)\n    sce <- indexCell(sce)\n    end_time <- Sys.time()\n    Training_Time_scmapcell[i] <- as.numeric(difftime(end_time,start_time,units = 'secs'))\n    \n    start_time <- Sys.time()\n    scmapCell_results <- scmapCell(sce_test,list(metadata(sce)$scmap_cell_index))\n    scmapCell_clusters <- 
scmapCell2Cluster(scmapCell_results,list(as.character(colData(sce)$cell_type1)))\n    end_time <- Sys.time()\n    Testing_Time_scmapcell[i] <- as.numeric(difftime(end_time,start_time,units = 'secs'))\n    \n    True_Labels_scmapcell[i] <- list(Labels[Test_Idx[[i]]])\n    Pred_Labels_scmapcell[i] <- list(scmapCell_clusters$combined_labs)\n  }\n  \n  True_Labels_scmapcell <- as.vector(unlist(True_Labels_scmapcell))\n  Pred_Labels_scmapcell <- as.vector(unlist(Pred_Labels_scmapcell))\n  Training_Time_scmapcell <- as.vector(unlist(Training_Time_scmapcell))\n  Testing_Time_scmapcell <- as.vector(unlist(Testing_Time_scmapcell))\n  \n  write.csv(True_Labels_scmapcell,paste0(OutputDir,'/scmapcell_true.csv'),row.names = FALSE)\n  write.csv(Pred_Labels_scmapcell,paste0(OutputDir,'/scmapcell_pred.csv'),row.names = FALSE)\n  write.csv(Training_Time_scmapcell,paste0(OutputDir,'/scmapcell_training_time.csv'),row.names = FALSE)\n  write.csv(Testing_Time_scmapcell,paste0(OutputDir,'/scmapcell_test_time.csv'),row.names = FALSE)\n}\nif (args[6] == \"0\") {\n  run_scmapcell(args[1], args[2], args[3], args[4])\n} else {\n  run_scmapcell(args[1], args[2], args[3], args[4], args[5], as.numeric(args[6]))\n}\n\n\n"
  },
  {
    "path": "Snakemake/Scripts/run_scmapcluster.R",
    "content": "args <- commandArgs(TRUE)\n\nrun_scmapcluster <- function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){\n  \"\n  run scmapcluster\n  Wrapper script to run scmap on a benchmark dataset with 5-fold cross validation,\n  outputs lists of true and predicted cell labels as csv files, as well as computation time.\n  \n  Parameters\n  ----------\n  DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes \n  as row names and gene names as column names.\n  LabelsPath : Cell population annotations file path (.csv).\n  CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.\n  OutputDir : Output directory defining the path of the exported file.\n  GeneOrderPath : Gene order file path (.csv) obtained from feature selection, \n  defining the genes order for each cross validation fold, default is NULL.\n  NumGenes : Number of genes used in case of feature selection (integer), default is NULL.\n  \"\n  \n  Data <- read.csv(DataPath,row.names = 1)\n  Labels <- as.matrix(read.csv(LabelsPath))\n  load(CV_RDataPath)\n  Labels <- as.vector(Labels[,col_Index])\n  Data <- Data[Cells_to_Keep,]\n  Labels <- Labels[Cells_to_Keep]\n  if(!is.null(GeneOrderPath) & !is.null (NumGenes)){\n    GenesOrder = read.csv(GeneOrderPath)\n  }\n  \n  #############################################################################\n  #                                 scmap                                     #\n  #############################################################################\n  library(scmap)\n  library(SingleCellExperiment)\n  True_Labels_scmapcluster <- list()\n  Pred_Labels_scmapcluster <- list()\n  Training_Time_scmapcluster <- list()\n  Testing_Time_scmapcluster <- list()\n  Data = t(as.matrix(Data))\n  \n  for (i in c(1:n_folds)){\n    if(!is.null(GeneOrderPath) & !is.null (NumGenes)){\n      sce <- SingleCellExperiment(list(normcounts = 
Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Train_Idx[[i]]]), \n                                  colData = data.frame(cell_type1 = Labels[Train_Idx[[i]]]))\n      logcounts(sce) <- log2(normcounts(sce) + 1)\n      # use gene names as feature symbols\n      rowData(sce)$feature_symbol <- rownames(sce)\n      sce <- selectFeatures(sce, n_features = NumGenes, suppress_plot = TRUE)\n      \n      sce_test <- SingleCellExperiment(list(normcounts = Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Test_Idx[[i]]]), \n                                       colData = data.frame(cell_type1 = Labels[Test_Idx[[i]]]))\n      logcounts(sce_test) <- log2(normcounts(sce_test) + 1)\n      rowData(sce_test)$feature_symbol <- rownames(sce_test)\n      sce_test@rowRanges@elementMetadata@listData = sce@rowRanges@elementMetadata@listData\n    }\n    else{\n      sce <- SingleCellExperiment(list(normcounts = Data[,Train_Idx[[i]]]), \n                                  colData = data.frame(cell_type1 = Labels[Train_Idx[[i]]]))\n      logcounts(sce) <- log2(normcounts(sce) + 1)\n      # use gene names as feature symbols\n      rowData(sce)$feature_symbol <- rownames(sce)\n      sce <- selectFeatures(sce, suppress_plot = TRUE)\n      \n      sce_test <- SingleCellExperiment(list(normcounts = Data[,Test_Idx[[i]]]), \n                                       colData = data.frame(cell_type1 = Labels[Test_Idx[[i]]]))\n      logcounts(sce_test) <- log2(normcounts(sce_test) + 1)\n      rowData(sce_test)$feature_symbol <- rownames(sce_test)\n      sce_test@rowRanges@elementMetadata@listData = sce@rowRanges@elementMetadata@listData\n    }\n    \n    # scmap-cluster\n    start_time <- Sys.time()\n    sce <- indexCluster(sce)\n    end_time <- Sys.time()\n    Training_Time_scmapcluster[i] <- as.numeric(difftime(end_time,start_time,units = 'secs'))\n    \n    start_time <- Sys.time()\n    scmapCluster_results <- scmapCluster(projection = sce_test,index_list = list(metadata(sce)$scmap_cluster_index))\n    
end_time <- Sys.time()\n    Testing_Time_scmapcluster[i] <- as.numeric(difftime(end_time,start_time,units = 'secs'))\n    \n    True_Labels_scmapcluster[i] <- list(Labels[Test_Idx[[i]]])\n    Pred_Labels_scmapcluster[i] <- list(scmapCluster_results$combined_labs)\n    \n  }\n  \n  True_Labels_scmapcluster <- as.vector(unlist(True_Labels_scmapcluster))\n  Pred_Labels_scmapcluster <- as.vector(unlist(Pred_Labels_scmapcluster))\n  Training_Time_scmapcluster <- as.vector(unlist(Training_Time_scmapcluster))\n  Testing_Time_scmapcluster <- as.vector(unlist(Testing_Time_scmapcluster))\n\n  write.csv(True_Labels_scmapcluster,paste0(OutputDir,'/scmapcluster_true.csv'),row.names = FALSE)\n  write.csv(Pred_Labels_scmapcluster,paste0(OutputDir,'/scmapcluster_pred.csv'),row.names = FALSE)\n  write.csv(Training_Time_scmapcluster,paste0(OutputDir,'/scmapcluster_training_time.csv'),row.names = FALSE)\n  write.csv(Testing_Time_scmapcluster,paste0(OutputDir,'/scmapcluster_test_time.csv'),row.names = FALSE)\n\n\n}\nif (args[6] == \"0\") {\n  run_scmapcluster(args[1], args[2], args[3], args[4])\n} else {\n  run_scmapcluster(args[1], args[2], args[3], args[4], args[5], as.numeric(args[6]))\n}\n"
  },
  {
    "path": "Snakemake/Scripts/run_scmaptotal.R",
    "content": "run_scmap <- function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){\r\n  \"\r\n  run scmap\r\n  Wrapper script to run scmap on a benchmark dataset with 5-fold cross validation,\r\n  outputs lists of true and predicted cell labels as csv files, as well as computation time.\r\n  \r\n  Parameters\r\n  ----------\r\n  DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes \r\n  as row names and gene names as column names.\r\n  LabelsPath : Cell population annotations file path (.csv).\r\n  CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.\r\n  OutputDir : Output directory defining the path of the exported file.\r\n  GeneOrderPath : Gene order file path (.csv) obtained from feature selection, \r\n  defining the genes order for each cross validation fold, default is NULL.\r\n  NumGenes : Number of genes used in case of feature selection (integer), default is NULL.\r\n  \"\r\n  \r\n  Data <- read.csv(DataPath,row.names = 1)\r\n  Labels <- as.matrix(read.csv(LabelsPath))\r\n  load(CV_RDataPath)\r\n  Labels <- as.vector(Labels[,col_Index])\r\n  Data <- Data[Cells_to_Keep,]\r\n  Labels <- Labels[Cells_to_Keep]\r\n  if(!is.null(GeneOrderPath) & !is.null (NumGenes)){\r\n    GenesOrder = read.csv(GeneOrderPath)\r\n  }\r\n  \r\n  #############################################################################\r\n  #                                 scmap                                     #\r\n  #############################################################################\r\n  library(scmap)\r\n  library(SingleCellExperiment)\r\n  True_Labels_scmapcluster <- list()\r\n  Pred_Labels_scmapcluster <- list()\r\n  True_Labels_scmapcell <- list()\r\n  Pred_Labels_scmapcell <- list()\r\n  Training_Time_scmapcluster <- list()\r\n  Testing_Time_scmapcluster <- list()\r\n  Training_Time_scmapcell <- list()\r\n  Testing_Time_scmapcell <- list()\r\n  Data = 
t(as.matrix(Data))\r\n  \r\n  for (i in c(1:n_folds)){\r\n    if(!is.null(GeneOrderPath) & !is.null (NumGenes)){\r\n      sce <- SingleCellExperiment(list(normcounts = Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Train_Idx[[i]]]), \r\n                                  colData = data.frame(cell_type1 = Labels[Train_Idx[[i]]]))\r\n      logcounts(sce) <- log2(normcounts(sce) + 1)\r\n      # use gene names as feature symbols\r\n      rowData(sce)$feature_symbol <- rownames(sce)\r\n      sce <- selectFeatures(sce, n_features = NumGenes, suppress_plot = TRUE)\r\n      \r\n      sce_test <- SingleCellExperiment(list(normcounts = Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Test_Idx[[i]]]), \r\n                                       colData = data.frame(cell_type1 = Labels[Test_Idx[[i]]]))\r\n      logcounts(sce_test) <- log2(normcounts(sce_test) + 1)\r\n      rowData(sce_test)$feature_symbol <- rownames(sce_test)\r\n      sce_test@rowRanges@elementMetadata@listData = sce@rowRanges@elementMetadata@listData\r\n    }\r\n    else{\r\n      sce <- SingleCellExperiment(list(normcounts = Data[,Train_Idx[[i]]]), \r\n                                  colData = data.frame(cell_type1 = Labels[Train_Idx[[i]]]))\r\n      logcounts(sce) <- log2(normcounts(sce) + 1)\r\n      # use gene names as feature symbols\r\n      rowData(sce)$feature_symbol <- rownames(sce)\r\n      sce <- selectFeatures(sce, suppress_plot = TRUE)\r\n      \r\n      sce_test <- SingleCellExperiment(list(normcounts = Data[,Test_Idx[[i]]]), \r\n                                       colData = data.frame(cell_type1 = Labels[Test_Idx[[i]]]))\r\n      logcounts(sce_test) <- log2(normcounts(sce_test) + 1)\r\n      rowData(sce_test)$feature_symbol <- rownames(sce_test)\r\n      sce_test@rowRanges@elementMetadata@listData = sce@rowRanges@elementMetadata@listData\r\n    }\r\n    \r\n    # scmap-cluster\r\n    start_time <- Sys.time()\r\n    sce <- indexCluster(sce)\r\n    end_time <- Sys.time()\r\n    
Training_Time_scmapcluster[i] <- as.numeric(difftime(end_time,start_time,units = 'secs'))\r\n    \r\n    start_time <- Sys.time()\r\n    scmapCluster_results <- scmapCluster(projection = sce_test,index_list = list(metadata(sce)$scmap_cluster_index))\r\n    end_time <- Sys.time()\r\n    Testing_Time_scmapcluster[i] <- as.numeric(difftime(end_time,start_time,units = 'secs'))\r\n    \r\n    True_Labels_scmapcluster[i] <- list(Labels[Test_Idx[[i]]])\r\n    Pred_Labels_scmapcluster[i] <- list(scmapCluster_results$combined_labs)\r\n    \r\n    # scmap-cell\r\n    start_time <- Sys.time()\r\n    set.seed(1)\r\n    sce <- indexCell(sce)\r\n    end_time <- Sys.time()\r\n    Training_Time_scmapcell[i] <- as.numeric(difftime(end_time,start_time,units = 'secs'))\r\n    \r\n    start_time <- Sys.time()\r\n    scmapCell_results <- scmapCell(sce_test,list(metadata(sce)$scmap_cell_index))\r\n    scmapCell_clusters <- scmapCell2Cluster(scmapCell_results,list(as.character(colData(sce)$cell_type1)))\r\n    end_time <- Sys.time()\r\n    Testing_Time_scmapcell[i] <- as.numeric(difftime(end_time,start_time,units = 'secs'))\r\n    \r\n    True_Labels_scmapcell[i] <- list(Labels[Test_Idx[[i]]])\r\n    Pred_Labels_scmapcell[i] <- list(scmapCell_clusters$combined_labs)\r\n  }\r\n  \r\n  True_Labels_scmapcluster <- as.vector(unlist(True_Labels_scmapcluster))\r\n  Pred_Labels_scmapcluster <- as.vector(unlist(Pred_Labels_scmapcluster))\r\n  True_Labels_scmapcell <- as.vector(unlist(True_Labels_scmapcell))\r\n  Pred_Labels_scmapcell <- as.vector(unlist(Pred_Labels_scmapcell))\r\n  Training_Time_scmapcluster <- as.vector(unlist(Training_Time_scmapcluster))\r\n  Testing_Time_scmapcluster <- as.vector(unlist(Testing_Time_scmapcluster))\r\n  Training_Time_scmapcell <- as.vector(unlist(Training_Time_scmapcell))\r\n  Testing_Time_scmapcell <- as.vector(unlist(Testing_Time_scmapcell))\r\n  \r\n  setwd(OutputDir)\r\n  \r\n  if (!is.null(GeneOrderPath) & !is.null (NumGenes)){\r\n    
write.csv(True_Labels_scmapcluster,paste('scmapcluster_',NumGenes,'_True_Labels.csv', sep = ''),row.names = FALSE)\r\n    write.csv(Pred_Labels_scmapcluster,paste('scmapcluster_',NumGenes,'_Pred_Labels.csv', sep = ''),row.names = FALSE)\r\n    write.csv(True_Labels_scmapcell,paste('scmapcell_',NumGenes,'_True_Labels.csv', sep = ''),row.names = FALSE)\r\n    write.csv(Pred_Labels_scmapcell,paste('scmapcell_',NumGenes,'_Pred_Labels.csv', sep = ''),row.names = FALSE)\r\n    write.csv(Training_Time_scmapcluster,paste('scmapcluster_',NumGenes,'_Training_Time.csv', sep = ''),row.names = FALSE)\r\n    write.csv(Testing_Time_scmapcluster,paste('scmapcluster_',NumGenes,'_Testing_Time.csv', sep = ''),row.names = FALSE)\r\n    write.csv(Training_Time_scmapcell,paste('scmapcell_',NumGenes,'_Training_Time.csv', sep = ''),row.names = FALSE)\r\n    write.csv(Testing_Time_scmapcell,paste('scmapcell_',NumGenes,'_Testing_Time.csv', sep = ''),row.names = FALSE)\r\n  }\r\n  else{\r\n    write.csv(True_Labels_scmapcluster,'scmapcluster_True_Labels.csv',row.names = FALSE)\r\n    write.csv(Pred_Labels_scmapcluster,'scmapcluster_Pred_Labels.csv',row.names = FALSE)\r\n    write.csv(True_Labels_scmapcell,'scmapcell_True_Labels.csv',row.names = FALSE)\r\n    write.csv(Pred_Labels_scmapcell,'scmapcell_Pred_Labels.csv',row.names = FALSE)\r\n    write.csv(Training_Time_scmapcluster,'scmapcluster_Training_Time.csv',row.names = FALSE)\r\n    write.csv(Testing_Time_scmapcluster,'scmapcluster_Testing_Time.csv',row.names = FALSE)\r\n    write.csv(Training_Time_scmapcell,'scmapcell_Training_Time.csv',row.names = FALSE)\r\n    write.csv(Testing_Time_scmapcell,'scmapcell_Testing_Time.csv',row.names = FALSE)\r\n  }\r\n}\r\n"
  },
  {
    "path": "Snakemake/Scripts/run_singleCellNet.R",
    "content": "args <- commandArgs(TRUE)\r\n\r\nrun_singleCellNet<-function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){\r\n  \"\r\n  run singleCellNet\r\n  Wrapper script to run singleCellNet on a benchmark dataset with 5-fold cross validation,\r\n  outputs lists of true and predicted cell labels as csv files, as well as computation time.\r\n\r\n  Parameters\r\n  ----------\r\n  DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes\r\n  as row names and gene names as column names.\r\n  LabelsPath : Cell population annotations file path (.csv).\r\n  CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.\r\n  OutputDir : Output directory defining the path of the exported file.\r\n  GeneOrderPath : Gene order file path (.csv) obtained from feature selection,\r\n  defining the genes order for each cross validation fold, default is NULL.\r\n  NumGenes : Number of genes used in case of feature selection (integer), default is NULL.\r\n  \"\r\n\r\n  Data <- read.csv(DataPath,row.names = 1)\r\n  colnames(Data) <- gsub('_','.',colnames(Data), fixed = TRUE)\r\n  Labels <- as.matrix(read.csv(LabelsPath))\r\n  load(CV_RDataPath)\r\n  Labels <- as.vector(Labels[,col_Index])\r\n  Data <- Data[Cells_to_Keep,]\r\n  Labels <- Labels[Cells_to_Keep]\r\n  if(!is.null(GeneOrderPath) & !is.null (NumGenes)){\r\n    GenesOrder = read.csv(GeneOrderPath)\r\n  }\r\n\r\n  #############################################################################\r\n  #                              singleCellNet                                #\r\n  #############################################################################\r\n  library(singleCellNet)\r\n  library(dplyr)\r\n  True_Labels_singleCellNet <- list()\r\n  Pred_Labels_singleCellNet <- list()\r\n  Training_Time_singleCellNet <- list()\r\n  Testing_Time_singleCellNet <- list()\r\n  Data = t(as.matrix(Data))              # deals also with 
sparse matrix\r\n\r\n  for(i in c(1:n_folds)){\r\n    if(!is.null(GeneOrderPath) & !is.null (NumGenes)){\r\n      DataTrain <- Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Train_Idx[[i]]]\r\n      DataTest <- Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Test_Idx[[i]]]\r\n    }\r\n    else{\r\n      DataTrain <- Data[,Train_Idx[[i]]]\r\n      DataTest <- Data[,Test_Idx[[i]]]\r\n    }\r\n\r\n    start_time <- Sys.time()\r\n    cgenes2<-findClassyGenes(DataTrain, data.frame(Annotation = Labels[Train_Idx[[i]]]), \"Annotation\")\r\n    cgenesA<-cgenes2[['cgenes']]\r\n    grps<-cgenes2[['grps']]\r\n    DataTrain<-as.matrix(DataTrain[cgenesA,])\r\n    xpairs<-ptGetTop(DataTrain, grps, ncores = 1)\r\n    pdTrain<-query_transform(DataTrain[cgenesA, ], xpairs)\r\n    rf<-sc_makeClassifier(pdTrain[xpairs,], genes=xpairs, groups=grps)\r\n    end_time <- Sys.time()\r\n    Training_Time_singleCellNet[i] <- as.numeric(difftime(end_time,start_time,units = 'secs'))\r\n\r\n    start_time <- Sys.time()\r\n    DataTest<-query_transform(DataTest[cgenesA,], xpairs)\r\n    classRes <-rf_classPredict(rf, DataTest)\r\n    end_time <- Sys.time()\r\n    Testing_Time_singleCellNet[i] <- as.numeric(difftime(end_time,start_time,units = 'secs'))\r\n\r\n    True_Labels_singleCellNet[i] <- list(Labels[Test_Idx[[i]]])\r\n    Pred_Labels_singleCellNet[i] <- list((rownames(classRes)[apply(classRes,2,which.max)])[1:length(Test_Idx[[i]])])\r\n  }\r\n  True_Labels_singleCellNet <- as.vector(unlist(True_Labels_singleCellNet))\r\n  Pred_Labels_singleCellNet <- as.vector(unlist(Pred_Labels_singleCellNet))\r\n  Training_Time_singleCellNet <- as.vector(unlist(Training_Time_singleCellNet))\r\n  Testing_Time_singleCellNet <- as.vector(unlist(Testing_Time_singleCellNet))\r\n  write.csv(True_Labels_singleCellNet,paste0(OutputDir,'/singleCellNet_true.csv'),row.names = FALSE)\r\n  write.csv(Pred_Labels_singleCellNet,paste0(OutputDir,'/singleCellNet_pred.csv'),row.names = FALSE)\r\n  
write.csv(Training_Time_singleCellNet,paste0(OutputDir,'/singleCellNet_training_time.csv'),row.names = FALSE)\r\n  write.csv(Testing_Time_singleCellNet,paste0(OutputDir,'/singleCellNet_test_time.csv'),row.names = FALSE)\r\n}\r\n\r\nif (args[6] == \"0\") {\r\n  run_singleCellNet(args[1], args[2], args[3], args[4])\r\n} else {\r\n  run_singleCellNet(args[1], args[2], args[3], args[4], args[5], as.numeric(args[6]))\r\n}\r\n"
  },
  {
    "path": "Snakemake/Snakefile",
    "content": "dockerTag = \"latest\" #FIXME tagged versions\n\ndef feature_ranking(w):\n    if \"feature_ranking\" in config.keys():\n        return config[\"feature_ranking\"]\n    else:\n        return \"{output_dir}/rank_genes_dropouts.csv\".format(\n            output_dir=w.output_dir)\n\n\"\"\"\nOne rule to... rule... them all...\n\"\"\"\nrule all:\n  input:\n    tool_outputs = expand(\n        \"{output_dir}/evaluation/{measure}/{tool}.csv\",\n        tool=config[\"tools_to_run\"],\n        output_dir=config[\"output_dir\"],\n        measure=[\"Confusion\", \"F1\", \"PopSize\", \"Summary\"])\n\n\n\"\"\"\nRule for the result evaluation\n\"\"\"\nrule evaluate:\n  input:\n    true=\"{output_dir}/{tool}/{tool}_true.csv\",\n    pred=\"{output_dir}/{tool}/{tool}_pred.csv\"\n  output:\n    \"{output_dir}/evaluation/Confusion/{tool}.csv\",\n    \"{output_dir}/evaluation/F1/{tool}.csv\",\n    \"{output_dir}/evaluation/PopSize/{tool}.csv\",\n    \"{output_dir}/evaluation/Summary/{tool}.csv\",\n  log: \"{output_dir}/evaluation/{tool}.log\"\n  singularity: \"docker://scrnaseqbenchmark/baseline:{}\".format(dockerTag)\n  shell:\n    \"Rscript evaluate.R \"\n    \"{input.true} \"\n    \"{input.pred} \"\n    \"{wildcards.output_dir}/evaluation \"\n    \"{wildcards.tool} \"\n    \"&> {log}\"\n\n\n\"\"\"\nRule for creating cross validation folds\n\"\"\"\nrule generate_CV_folds:\n  input: config[\"labfile\"],\n  output: \"{output_dir}/CV_folds.RData\"\n  log: \"{output_dir}/CV_folds.log\"\n  params:\n    column = config.get(\"column\", 1) # default to 1\n  singularity: \"docker://scrnaseqbenchmark/cross_validation:{}\".format(dockerTag)\n  shell:\n    \"Rscript Cross_Validation.R \"\n    \"{input} \"\n    \"{params.column} \"\n    \"{wildcards.output_dir} \"\n    \"&> {log}\"\n\n\n\"\"\"\nRule for creating feature rank lists\n\"\"\"\nrule generate_dropouts_feature_rankings:\n    input:\n        datafile = config[\"datafile\"],\n        folds = 
\"{output_dir}/CV_folds.RData\"\n    output: \"{output_dir}/rank_genes_dropouts.csv\"\n    log: \"{output_dir}/rank_genes_dropouts.log\"\n    singularity: \"docker://scrnaseqbenchmark/baseline:{}\".format(dockerTag)\n    shell:\n        \"echo test > {wildcards.output_dir}/test\\n\"\n        \"python3 rank_gene_dropouts.py \"\n        \"{input.datafile} \"\n        \"{input.folds} \"\n        \"{wildcards.output_dir} \"\n        \"&> {log}\"\n\n\n\"\"\"\nRule for R based tools.\n\"\"\"\nrule singleCellNet:\n  input:\n    datafile = config[\"datafile\"],\n    labfile = config[\"labfile\"],\n    folds = \"{output_dir}/CV_folds.RData\",\n    ranking = feature_ranking\n  output:\n    pred = \"{output_dir}/singleCellNet/singleCellNet_pred.csv\",\n    true = \"{output_dir}/singleCellNet/singleCellNet_true.csv\",\n    test_time = \"{output_dir}/singleCellNet/singleCellNet_test_time.csv\",\n    training_time = \"{output_dir}/singleCellNet/singleCellNet_training_time.csv\"\n  log: \"{output_dir}/singleCellNet/singleCellNet.log\"\n  params:\n    n_features = config.get(\"number_of_features\", 0)\n  singularity: \"docker://scrnaseqbenchmark/singlecellnet:{}\".format(dockerTag)\n  shell:\n    \"Rscript Scripts/run_singleCellNet.R \"\n    \"{input.datafile} \"\n    \"{input.labfile} \"\n    \"{input.folds} \"\n    \"{wildcards.output_dir}/singleCellNet \"\n    \"{input.ranking} \"\n    \"{params.n_features} \"\n    \"&> {log}\"\n\nrule scmapcell:\n  input:\n    datafile = config[\"datafile\"],\n    labfile = config[\"labfile\"],\n    folds = \"{output_dir}/CV_folds.RData\",\n    ranking = feature_ranking\n  output:\n    pred = \"{output_dir}/scmapcell/scmapcell_pred.csv\",\n    true = \"{output_dir}/scmapcell/scmapcell_true.csv\",\n    test_time = \"{output_dir}/scmapcell/scmapcell_test_time.csv\",\n    training_time = \"{output_dir}/scmapcell/scmapcell_training_time.csv\"\n  log: \"{output_dir}/scmapcell/scmapcell.log\"\n  params:\n    n_features = 
config.get(\"number_of_features\", 0)\n  singularity: \"docker://scrnaseqbenchmark/scmap:{}\".format(dockerTag)\n  shell:\n    \"Rscript Scripts/run_scmapcell.R \"\n    \"{input.datafile} \"\n    \"{input.labfile} \"\n    \"{input.folds} \"\n    \"{wildcards.output_dir}/scmapcell \"\n    \"{input.ranking} \"\n    \"{params.n_features} \"\n    \"&> {log}\"\n\nrule scmapcluster:\n  input:\n    datafile = config[\"datafile\"],\n    labfile = config[\"labfile\"],\n    folds = \"{output_dir}/CV_folds.RData\",\n    ranking = feature_ranking\n  output:\n    pred = \"{output_dir}/scmapcluster/scmapcluster_pred.csv\",\n    true = \"{output_dir}/scmapcluster/scmapcluster_true.csv\",\n    test_time = \"{output_dir}/scmapcluster/scmapcluster_test_time.csv\",\n    training_time = \"{output_dir}/scmapcluster/scmapcluster_training_time.csv\"\n  log: \"{output_dir}/scmapcluster/scmapcluster.log\"\n  params:\n    n_features = config.get(\"number_of_features\", 0)\n  singularity: \"docker://scrnaseqbenchmark/scmap:{}\".format(dockerTag)\n  shell:\n    \"Rscript Scripts/run_scmapcluster.R \"\n    \"{input.datafile} \"\n    \"{input.labfile} \"\n    \"{input.folds} \"\n    \"{wildcards.output_dir}/scmapcluster \"\n    \"{input.ranking} \"\n    \"{params.n_features} \"\n    \"&> {log}\"\n\nrule scID:\n  input:\n    datafile = config[\"datafile\"],\n    labfile = config[\"labfile\"],\n    folds = \"{output_dir}/CV_folds.RData\",\n    ranking = feature_ranking\n  output:\n    pred = \"{output_dir}/scID/scID_pred.csv\",\n    true = \"{output_dir}/scID/scID_true.csv\",\n    total_time = \"{output_dir}/scID/scID_total_time.csv\"\n  log: \"{output_dir}/scID/scID.log\"\n  params:\n    n_features = config.get(\"number_of_features\", 0)\n  singularity: \"docker://scrnaseqbenchmark/scid:{}\".format(dockerTag)\n  shell:\n    \"Rscript Scripts/run_scID.R \"\n    \"{input.datafile} \"\n    \"{input.labfile} \"\n    \"{input.folds} \"\n    \"{wildcards.output_dir}/scID \"\n    \"{input.ranking} \"\n 
   \"{params.n_features} \"\n    \"&> {log}\"\n\nrule CHETAH:\n  input:\n    datafile = config[\"datafile\"],\n    labfile = config[\"labfile\"],\n    folds = \"{output_dir}/CV_folds.RData\",\n    ranking = feature_ranking\n  output:\n    pred = \"{output_dir}/CHETAH/CHETAH_pred.csv\",\n    true = \"{output_dir}/CHETAH/CHETAH_true.csv\",\n    total_time = \"{output_dir}/CHETAH/CHETAH_total_time.csv\"\n  log: \"{output_dir}/CHETAH/CHETAH.log\"\n  params:\n    n_features = config.get(\"number_of_features\", 0)\n  singularity: \"docker://scrnaseqbenchmark/chetah:{}\".format(dockerTag)\n  shell:\n    \"Rscript Scripts/run_CHETAH.R \"\n    \"{input.datafile} \"\n    \"{input.labfile} \"\n    \"{input.folds} \"\n    \"{wildcards.output_dir}/CHETAH \"\n    \"{input.ranking} \"\n    \"{params.n_features} \"\n    \"&> {log}\"\n\nrule SingleR:\n  input:\n    datafile = config[\"datafile\"],\n    labfile = config[\"labfile\"],\n    folds = \"{output_dir}/CV_folds.RData\",\n    ranking = feature_ranking\n  output:\n    pred = \"{output_dir}/SingleR/SingleR_pred.csv\",\n    true = \"{output_dir}/SingleR/SingleR_true.csv\",\n    total_time = \"{output_dir}/SingleR/SingleR_total_time.csv\"\n  log: \"{output_dir}/SingleR/SingleR.log\"\n  params:\n    n_features = config.get(\"number_of_features\", 0)\n  singularity: \"docker://scrnaseqbenchmark/singler:{}\".format(dockerTag)\n  shell:\n    \"Rscript Scripts/run_SingleR.R \"\n    \"{input.datafile} \"\n    \"{input.labfile} \"\n    \"{input.folds} \"\n    \"{wildcards.output_dir}/SingleR \"\n    \"{input.ranking} \"\n    \"{params.n_features} \"\n    \"&> {log}\"\n\n#NOTE non-conformant to the rest of the rules.\nrule Garnett_CV:\n  input:\n    datafile = config[\"datafile\"],\n    labfile = config[\"labfile\"],\n    folds = \"{output_dir}/CV_folds.RData\",\n    genes_names = config.get(\"genes\", \"UNSPECIFIEDFILE\"),\n    markers = config.get(\"Garnett_CV\", {}).get(\n        \"markers\", \"UNSPECIFIEDFILE\")\n  output:\n    pred 
= \"{output_dir}/Garnett_CV/Garnett_CV_pred.csv\",\n    true = \"{output_dir}/Garnett_CV/Garnett_CV_true.csv\",\n    test_time = \"{output_dir}/Garnett_CV/Garnett_CV_test_time.csv\",\n    training_time = \"{output_dir}/Garnett_CV/Garnett_CV_training_time.csv\"\n  log: \"{output_dir}/Garnett_CV/Garnett_CV.log\"\n  params:\n    human = \"T\" if config.get(\"human\", True) else \"F\"\n  singularity: \"docker://scrnaseqbenchmark/garnett:{}\".format(dockerTag)\n  shell:\n    \"Rscript Scripts/run_Garnett_CV.R \"\n    \"{input.datafile} \"\n    \"{input.labfile} \"\n    \"{input.folds} \"\n    \"{input.genes_names} \"\n    \"{input.markers} \"\n    \"{wildcards.output_dir}/Garnett_CV \"\n    \"{params.human} \"\n    \"&> {log}\"\n\n#NOTE non-conformant to the rest of the rules.\nrule Garnett_Pretrained: #TODO test this\n  input:\n    datafile = config[\"datafile\"],\n    labfile = config[\"labfile\"],\n    folds = \"{output_dir}/CV_folds.RData\",\n    genes_names = config.get(\"genes\", \"UNSPECIFIEDFILE\"),\n    classifier = config.get(\"Garnett_Pretrained\", {}).get(\n        \"classifier\", \"UNSPECIFIEDFILE\")\n  output:\n    pred = \"{output_dir}/Garnett_Pretrained/Garnett_Pretrained_pred.csv\",\n    true = \"{output_dir}/Garnett_Pretrained/Garnett_Pretrained_true.csv\",\n    test_time = \"{output_dir}/Garnett_Pretrained/Garnett_Pretrained_test_time.csv\"\n  log: \"{output_dir}/Garnett_Pretrained/Garnett_Pretrained.log\"\n  params:\n    human = \"T\" if config.get(\"human\", True) else \"F\"\n  singularity: \"docker://scrnaseqbenchmark/garnett:{}\".format(dockerTag)\n  shell:\n    \"Rscript Scripts/run_Garnett_Pretrained.R \"\n    \"{input.datafile} \"\n    \"{input.labfile} \"\n    \"{input.genes_names} \"\n    \"{input.folds} \"\n    \"{input.classifier} \"\n    \"{wildcards.output_dir}/Garnett_Pretrained \"\n    \"{params.human} \"\n    \"&> {log}\"\n\n\n\"\"\"\nRules for python based tools.\n\"\"\"\nrule kNN50:\n  input:\n    datafile = config[\"datafile\"],\n   
 labfile = config[\"labfile\"],\n    folds = \"{output_dir}/CV_folds.RData\",\n    ranking = feature_ranking\n  output:\n    pred = \"{output_dir}/kNN50/kNN50_pred.csv\",\n    true = \"{output_dir}/kNN50/kNN50_true.csv\",\n    test_time = \"{output_dir}/kNN50/kNN50_test_time.csv\",\n    training_time = \"{output_dir}/kNN50/kNN50_training_time.csv\"\n  log: \"{output_dir}/kNN50/kNN50.log\"\n  params:\n    n_features = config.get(\"number_of_features\", 0)\n  singularity: \"docker://scrnaseqbenchmark/baseline:{}\".format(dockerTag)\n  shell:\n    \"python3 Scripts/run_kNN50.py \"\n    \"{input.datafile} \"\n    \"{input.labfile} \"\n    \"{input.folds} \"\n    \"{wildcards.output_dir}/kNN50 \"\n    \"{input.ranking} \"\n    \"{params.n_features} \"\n    \"&> {log}\"\n\nrule kNN9:\n  input:\n    datafile = config[\"datafile\"],\n    labfile = config[\"labfile\"],\n    folds = \"{output_dir}/CV_folds.RData\",\n    ranking = feature_ranking\n  output:\n    pred = \"{output_dir}/kNN9/kNN9_pred.csv\",\n    true = \"{output_dir}/kNN9/kNN9_true.csv\",\n    test_time = \"{output_dir}/kNN9/kNN9_test_time.csv\",\n    training_time = \"{output_dir}/kNN9/kNN9_training_time.csv\"\n  log: \"{output_dir}/kNN9/kNN9.log\"\n  params:\n    n_features = config.get(\"number_of_features\", 0)\n  singularity: \"docker://scrnaseqbenchmark/baseline:{}\".format(dockerTag)\n  shell:\n    \"python3 Scripts/run_kNN9.py \"\n    \"{input.datafile} \"\n    \"{input.labfile} \"\n    \"{input.folds} \"\n    \"{wildcards.output_dir}/kNN9 \"\n    \"{input.ranking} \"\n    \"{params.n_features} \"\n    \"&> {log}\"\n\nrule Cell_BLAST:\n  input:\n    datafile = config[\"datafile\"],\n    labfile = config[\"labfile\"],\n    folds = \"{output_dir}/CV_folds.RData\",\n    ranking = feature_ranking\n  output:\n    pred = \"{output_dir}/Cell_BLAST/Cell_BLAST_pred.csv\",\n    true = \"{output_dir}/Cell_BLAST/Cell_BLAST_true.csv\",\n    test_time = \"{output_dir}/Cell_BLAST/Cell_BLAST_test_time.csv\",\n    
training_time = \"{output_dir}/Cell_BLAST/Cell_BLAST_training_time.csv\"\n  log: \"{output_dir}/Cell_BLAST/Cell_BLAST.log\"\n  params:\n    n_features = config.get(\"number_of_features\", 0)\n  singularity: \"docker://scrnaseqbenchmark/cell_blast:{}\".format(dockerTag)\n  shell:\n    \"python3 Scripts/run_Cell_BLAST.py \"\n    \"{input.datafile} \"\n    \"{input.labfile} \"\n    \"{input.folds} \"\n    \"{wildcards.output_dir}/Cell_BLAST \"\n    \"{input.ranking} \"\n    \"{params.n_features} \"\n    \"&> {log}\"\n\nrule scVI:\n  input:\n    datafile = config[\"datafile\"],\n    labfile = config[\"labfile\"],\n    folds = \"{output_dir}/CV_folds.RData\",\n    ranking = feature_ranking\n  output:\n    pred = \"{output_dir}/scVI/scVI_pred.csv\",\n    true = \"{output_dir}/scVI/scVI_true.csv\",\n    test_time = \"{output_dir}/scVI/scVI_test_time.csv\",\n    training_time = \"{output_dir}/scVI/scVI_training_time.csv\"\n  log: \"{output_dir}/scVI/scVI.log\"\n  params:\n    n_features = config.get(\"number_of_features\", 0)\n  singularity: \"docker://scrnaseqbenchmark/scvi:{}\".format(dockerTag)\n  shell:\n    \"python3 Scripts/run_scVI.py \"\n    \"{input.datafile} \"\n    \"{input.labfile} \"\n    \"{input.folds} \"\n    \"{wildcards.output_dir}/scVI \"\n    \"{input.ranking} \"\n    \"{params.n_features} \"\n    \"&> {log}\"\n\nrule LDA:\n  input:\n    datafile = config[\"datafile\"],\n    labfile = config[\"labfile\"],\n    folds = \"{output_dir}/CV_folds.RData\",\n    ranking = feature_ranking\n  output:\n    pred = \"{output_dir}/LDA/LDA_pred.csv\",\n    true = \"{output_dir}/LDA/LDA_true.csv\",\n    test_time = \"{output_dir}/LDA/LDA_test_time.csv\",\n    training_time = \"{output_dir}/LDA/LDA_training_time.csv\"\n  log: \"{output_dir}/LDA/LDA.log\"\n  params:\n    n_features = config.get(\"number_of_features\", 0)\n  singularity: \"docker://scrnaseqbenchmark/baseline:{}\".format(dockerTag)\n  shell:\n    \"python3 Scripts/run_LDA.py \"\n    \"{input.datafile} 
\"\n    \"{input.labfile} \"\n    \"{input.folds} \"\n    \"{wildcards.output_dir}/LDA \"\n    \"{input.ranking} \"\n    \"{params.n_features} \"\n    \"&> {log}\"\n\nrule LDA_rejection:\n  input:\n    datafile = config[\"datafile\"],\n    labfile = config[\"labfile\"],\n    folds = \"{output_dir}/CV_folds.RData\",\n    ranking = feature_ranking\n  output:\n    pred = \"{output_dir}/LDA_rejection/LDA_rejection_pred.csv\",\n    true = \"{output_dir}/LDA_rejection/LDA_rejection_true.csv\",\n    test_time = \"{output_dir}/LDA_rejection/LDA_rejection_test_time.csv\",\n    training_time = \"{output_dir}/LDA_rejection/LDA_rejection_training_time.csv\"\n  log: \"{output_dir}/LDA_rejection/LDA_rejection.log\"\n  params:\n    n_features = config.get(\"number_of_features\", 0)\n  singularity: \"docker://scrnaseqbenchmark/baseline:{}\".format(dockerTag)\n  shell:\n    \"python3 Scripts/run_LDA_rejection.py \"\n    \"{input.datafile} \"\n    \"{input.labfile} \"\n    \"{input.folds} \"\n    \"{wildcards.output_dir}/LDA_rejection \"\n    \"{input.ranking} \"\n    \"{params.n_features} \"\n    \"&> {log}\"\n\nrule NMC:\n  input:\n    datafile = config[\"datafile\"],\n    labfile = config[\"labfile\"],\n    folds = \"{output_dir}/CV_folds.RData\",\n    ranking = feature_ranking\n  output:\n    pred = \"{output_dir}/NMC/NMC_pred.csv\",\n    true = \"{output_dir}/NMC/NMC_true.csv\",\n    test_time = \"{output_dir}/NMC/NMC_test_time.csv\",\n    training_time = \"{output_dir}/NMC/NMC_training_time.csv\"\n  log: \"{output_dir}/NMC/NMC.log\"\n  params:\n    n_features = config.get(\"number_of_features\", 0)\n  singularity: \"docker://scrnaseqbenchmark/baseline:{}\".format(dockerTag)\n  shell:\n    \"python3 Scripts/run_NMC.py \"\n    \"{input.datafile} \"\n    \"{input.labfile} \"\n    \"{input.folds} \"\n    \"{wildcards.output_dir}/NMC \"\n    \"{input.ranking} \"\n    \"{params.n_features} \"\n    \"&> {log}\"\n\nrule RF:\n  input:\n    datafile = config[\"datafile\"],\n    labfile 
= config[\"labfile\"],\n    folds = \"{output_dir}/CV_folds.RData\",\n    ranking = feature_ranking\n  output:\n    pred = \"{output_dir}/RF/RF_pred.csv\",\n    true = \"{output_dir}/RF/RF_true.csv\",\n    test_time = \"{output_dir}/RF/RF_test_time.csv\",\n    training_time = \"{output_dir}/RF/RF_training_time.csv\"\n  log: \"{output_dir}/RF/RF.log\"\n  params:\n    n_features = config.get(\"number_of_features\", 0)\n  singularity: \"docker://scrnaseqbenchmark/baseline:{}\".format(dockerTag)\n  shell:\n    \"python3 Scripts/run_RF.py \"\n    \"{input.datafile} \"\n    \"{input.labfile} \"\n    \"{input.folds} \"\n    \"{wildcards.output_dir}/RF \"\n    \"{input.ranking} \"\n    \"{params.n_features} \"\n    \"&> {log}\"\n\nrule SVM:\n  input:\n    datafile = config[\"datafile\"],\n    labfile = config[\"labfile\"],\n    folds = \"{output_dir}/CV_folds.RData\",\n    ranking = feature_ranking\n  output:\n    pred = \"{output_dir}/SVM/SVM_pred.csv\",\n    true = \"{output_dir}/SVM/SVM_true.csv\",\n    test_time = \"{output_dir}/SVM/SVM_test_time.csv\",\n    training_time = \"{output_dir}/SVM/SVM_training_time.csv\"\n  log: \"{output_dir}/SVM/SVM.log\"\n  params:\n    n_features = config.get(\"number_of_features\", 0)\n  singularity: \"docker://scrnaseqbenchmark/baseline:{}\".format(dockerTag)\n  shell:\n    \"python3 Scripts/run_SVM.py \"\n    \"{input.datafile} \"\n    \"{input.labfile} \"\n    \"{input.folds} \"\n    \"{wildcards.output_dir}/SVM \"\n    \"{input.ranking} \"\n    \"{params.n_features} \"\n    \"&> {log}\"\n\nrule SVM_rejection:\n  input:\n    datafile = config[\"datafile\"],\n    labfile = config[\"labfile\"],\n    folds = \"{output_dir}/CV_folds.RData\",\n    ranking = feature_ranking\n  output:\n    pred = \"{output_dir}/SVM_rejection/SVM_rejection_pred.csv\",\n    true = \"{output_dir}/SVM_rejection/SVM_rejection_true.csv\",\n    test_time = \"{output_dir}/SVM_rejection/SVM_rejection_test_time.csv\",\n    training_time = 
\"{output_dir}/SVM_rejection/SVM_rejection_training_time.csv\"\n  log: \"{output_dir}/SVM_rejection/SVM_rejection.log\"\n  params:\n    n_features = config.get(\"number_of_features\", 0)\n  singularity: \"docker://scrnaseqbenchmark/baseline:{}\".format(dockerTag)\n  shell:\n    \"python3 Scripts/run_SVM_rejection.py \"\n    \"{input.datafile} \"\n    \"{input.labfile} \"\n    \"{input.folds} \"\n    \"{wildcards.output_dir}/SVM_rejection \"\n    \"{input.ranking} \"\n    \"{params.n_features} \"\n    \"&> {log}\"\n"
  },
  {
    "path": "Snakemake/evaluate.R",
    "content": "args <- commandArgs(TRUE)\r\n\r\nTrueLabelsPath <- args[1]\r\nPredLabelsPath <- args[2]\r\nOutputDir <- args[3]\r\nToolName <- args[4]\r\n\r\nevaluate <- function(TrueLabelsPath, PredLabelsPath, Indices = NULL){\r\n  \"\r\n  Script to evaluate the performance of the classifier.\r\n  It returns multiple evaluation measures: the confusion matrix, median F1-score, F1-score for each class, accuracy, percentage of unlabeled, population size. \r\n  \r\n  The percentage of unlabeled cells is found by checking for cells that are labeled 'Unassigned', 'unassigned', 'Unknown', 'unknown', 'Nodexx', 'rand', or 'ambiguous'.\r\n  \r\n  Parameters\r\n  ----------\r\n  TrueLabelsPath: csv file with the true labels (format: one column, no index)\r\n  PredLabelsPath: csv file with the predicted labels (format: one column, no index)\r\n  Indices: which part of the csv file should be read (e.g. if more datasets are tested at the same time) (format: c(begin, end))\r\n  \r\n  Returns\r\n  -------\r\n  Conf: confusion matrix\r\n  MedF1 : median F1-score\r\n  F1 : F1-score per class\r\n  Acc : accuracy\r\n  PercUnl : percentage of unlabeled cells\r\n  PopSize : number of cells per cell type\r\n  \"\r\n  \r\n  true_lab <- unlist(read.csv(TrueLabelsPath))\r\n  pred_lab <- unlist(read.csv(PredLabelsPath))\r\n  \r\n  if (! 
is.null(Indices)){\r\n    true_lab <- true_lab[Indices]\r\n    pred_lab <- pred_lab[Indices]\r\n  }\r\n  \r\n  unique_true <- unlist(unique(true_lab))\r\n  unique_pred <- unlist(unique(pred_lab))\r\n  \r\n  unique_all <- unique(c(unique_true,unique_pred))\r\n  conf <- table(true_lab,pred_lab)\r\n  pop_size <- rowSums(conf)\r\n  \r\n  pred_lab = gsub('Node..','Node',pred_lab)\r\n  \r\n  conf_F1 <- table(true_lab,pred_lab,exclude = c('unassigned','Unassigned','Unknown','rand','Node','ambiguous','unknown'))\r\n\r\n  F1 <- vector()\r\n  sum_acc <- 0\r\n  \r\n  for (i in c(1:length(unique_true))){\r\n    findLabel = colnames(conf_F1) == row.names(conf_F1)[i]\r\n    if(sum(findLabel)){\r\n      prec <- conf_F1[i,findLabel] / colSums(conf_F1)[findLabel]\r\n      rec <- conf_F1[i,findLabel] / rowSums(conf_F1)[i]\r\n      if (prec == 0 || rec == 0){\r\n        F1[i] = 0\r\n      } else{\r\n        F1[i] <- (2*prec*rec) / (prec + rec)\r\n      }\r\n      sum_acc <- sum_acc + conf_F1[i,findLabel]\r\n    } else {\r\n      F1[i] = 0\r\n    }\r\n  }\r\n  \r\n  pop_size <- pop_size[pop_size > 0]\r\n  \r\n  names(F1) <- names(pop_size)\r\n  \r\n  med_F1 <- median(F1)\r\n  \r\n  total <- length(pred_lab)\r\n  num_unlab <- sum(pred_lab == 'unassigned') + sum(pred_lab == 'Unassigned') + sum(pred_lab == 'rand') + sum(pred_lab == 'Unknown') + sum(pred_lab == 'unknown') + sum(pred_lab == 'Node') + sum(pred_lab == 'ambiguous')\r\n  per_unlab <- num_unlab / total\r\n  \r\n  acc <- sum_acc/sum(conf_F1)\r\n  \r\n  result <- list(Conf = conf, MedF1 = med_F1, F1 = F1, Acc = acc, PercUnl = per_unlab, PopSize = pop_size)\r\n  \r\n  return(result)\r\n}\r\n\r\nresults <- evaluate(TrueLabelsPath, PredLabelsPath)\r\nwrite.csv(results$Conf, file.path(OutputDir, \"Confusion\", paste0(ToolName, \".csv\")))\r\nwrite.csv(results$F1, file.path(OutputDir, \"F1\", paste0(ToolName, \".csv\")))\r\nwrite.csv(results$PopSize, file.path(OutputDir, \"PopSize\", paste0(ToolName, \".csv\")))\r\ndf <- 
data.frame(results[c(\"MedF1\", \"Acc\", \"PercUnl\")])\r\nwrite.csv(df, file.path(OutputDir, \"Summary\", paste0(ToolName, \".csv\")))\r\n"
  },
  {
    "path": "Snakemake/example.config.yml",
    "content": "output_dir: output\ndatafile: input/data.csv\nlabfile: input/Labels.csv\ncolumn: 1\nnumber_of_features: 0\ntools_to_run:\n  - Cell_BLAST\n  - scVI\n  - scmapcell \n"
  },
  {
    "path": "Snakemake/rank_gene_dropouts.py",
    "content": "import os\r\nfrom sys import argv\r\nfrom pathlib import Path\r\n\r\nimport rpy2.robjects as robjects\r\nimport numpy as np\r\nimport pandas as pd\r\nfrom sklearn import linear_model\r\n\r\n\r\ndef rank_gene_dropouts(DataPath, CV_RDataPath, OutputDir):\r\n    '''\r\n    Script to rank the genes in the training set of the inputfile based on their dropout level.\r\n    This rank is written to a file.\r\n\r\n    Parameters\r\n    ----------\r\n    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes\r\n    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.\r\n    OutputDir : Output directory defining the path of the exported file.\r\n    '''\r\n\r\n    # read the Rdata file\r\n    robjects.r['load'](CV_RDataPath)\r\n\r\n    nfolds = np.array(robjects.r['n_folds'], dtype = 'int')\r\n    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')\r\n    train_ind = np.array(robjects.r['Train_Idx'])\r\n\r\n    # read the data\r\n    data = pd.read_csv(DataPath,index_col=0,sep=',')\r\n    data = data.iloc[tokeep]\r\n    data = np.log2(data+1)\r\n\r\n    genes = np.zeros([np.shape(data)[1],np.squeeze(nfolds)], dtype = '>U10')\r\n\r\n    for i in range(np.squeeze(nfolds)):\r\n        train_ind_i = np.array(train_ind[i], dtype = 'int') - 1\r\n        train=data.iloc[train_ind_i]\r\n        train.columns = np.arange(len(train.columns))\r\n\r\n        # rank genes training set\r\n        dropout = (train == 0).sum(axis='rows')\r\n        dropout = (dropout / train.shape[0]) * 100\r\n        mean = train.mean(axis='rows')\r\n\r\n        notzero = np.where((np.array(mean) > 0) & (np.array(dropout) > 0))[0]\r\n        zero = np.where(~((np.array(mean) > 0) & (np.array(dropout) > 0)))[0]\r\n        train_notzero = train.iloc[:,notzero]\r\n        train_zero = train.iloc[:,zero]\r\n        zero_genes = train_zero.columns\r\n\r\n        dropout = dropout.iloc[notzero]\r\n        mean = 
mean.iloc[notzero]\r\n\r\n        dropout = np.log2(np.array(dropout)).reshape(-1,1)\r\n        mean = np.array(mean).reshape(-1,1)\r\n        reg = linear_model.LinearRegression()\r\n        reg.fit(mean,dropout)\r\n\r\n        residuals = dropout - reg.predict(mean)\r\n        residuals = pd.Series(np.array(residuals).ravel(),index=train_notzero.columns)\r\n        residuals = residuals.sort_values(ascending=False)\r\n        sorted_genes = residuals.index\r\n        sorted_genes = sorted_genes.append(zero_genes)\r\n\r\n        genes[:,i] = sorted_genes.values\r\n\r\n\r\n    genes = pd.DataFrame(genes)\r\n\r\n    genes.to_csv(str(OutputDir / Path(\"rank_genes_dropouts.csv\")), index = False)\r\n\r\nrank_gene_dropouts(argv[1], argv[2], argv[3])\r\n"
  },
  {
    "path": "evaluate.R",
    "content": "evaluate <- function(TrueLabelsPath, PredLabelsPath, Indices = NULL){\r\n  \"\r\n  Script to evaluate the performance of the classifier.\r\n  It returns multiple evaluation measures: the confusion matrix, median F1-score, F1-score for each class, accuracy, percentage of unlabeled, population size. \r\n  \r\n  The percentage of unlabeled cells is found by checking for cells that are labeled 'Unassigned', 'unassigned', 'Unknown', 'unknown', 'Nodexx', 'rand', or 'ambiguous'.\r\n  \r\n  Parameters\r\n  ----------\r\n  TrueLabelsPath: csv file with the true labels (format: one column, no index)\r\n  PredLabelsPath: csv file with the predicted labels (format: one column, no index)\r\n  Indices: which part of the csv file should be read (e.g. if more datasets are tested at the same time) (format: c(begin, end))\r\n  \r\n  Returns\r\n  -------\r\n  Conf: confusion matrix\r\n  MedF1 : median F1-score\r\n  F1 : F1-score per class\r\n  Acc : accuracy\r\n  PercUnl : percentage of unlabeled cells\r\n  PopSize : number of cells per cell type\r\n  \"\r\n  \r\n  true_lab <- unlist(read.csv(TrueLabelsPath))\r\n  pred_lab <- unlist(read.csv(PredLabelsPath))\r\n  \r\n  if (! 
is.null(Indices)){\r\n    true_lab <- true_lab[Indices]\r\n    pred_lab <- pred_lab[Indices]\r\n  }\r\n  \r\n  unique_true <- unlist(unique(true_lab))\r\n  unique_pred <- unlist(unique(pred_lab))\r\n  \r\n  unique_all <- unique(c(unique_true,unique_pred))\r\n  conf <- table(true_lab,pred_lab)\r\n  pop_size <- rowSums(conf)\r\n  \r\n  pred_lab = gsub('Node..','Node',pred_lab)\r\n  \r\n  conf_F1 <- table(true_lab,pred_lab,exclude = c('unassigned','Unassigned','Unknown','rand','Node','ambiguous','unknown'))\r\n\r\n  F1 <- vector()\r\n  sum_acc <- 0\r\n  \r\n  for (i in c(1:length(unique_true))){\r\n    findLabel = colnames(conf_F1) == row.names(conf_F1)[i]\r\n    if(sum(findLabel)){\r\n      prec <- conf_F1[i,findLabel] / colSums(conf_F1)[findLabel]\r\n      rec <- conf_F1[i,findLabel] / rowSums(conf_F1)[i]\r\n      if (prec == 0 || rec == 0){\r\n        F1[i] = 0\r\n      } else{\r\n        F1[i] <- (2*prec*rec) / (prec + rec)\r\n      }\r\n      sum_acc <- sum_acc + conf_F1[i,findLabel]\r\n    } else {\r\n      F1[i] = 0\r\n    }\r\n  }\r\n  \r\n  pop_size <- pop_size[pop_size > 0]\r\n  \r\n  names(F1) <- names(pop_size)\r\n  \r\n  med_F1 <- median(F1)\r\n  \r\n  total <- length(pred_lab)\r\n  num_unlab <- sum(pred_lab == 'unassigned') + sum(pred_lab == 'Unassigned') + sum(pred_lab == 'rand') + sum(pred_lab == 'Unknown') + sum(pred_lab == 'unknown') + sum(pred_lab == 'Node') + sum(pred_lab == 'ambiguous')\r\n  per_unlab <- num_unlab / total\r\n  \r\n  acc <- sum_acc/sum(conf_F1)\r\n  \r\n  result <- list(Conf = conf, MedF1 = med_F1, F1 = F1, Acc = acc, PercUnl = per_unlab, PopSize = pop_size)\r\n  \r\n  return(result)\r\n}\r\n"
  },
  {
    "path": "rank_gene_dropouts.py",
    "content": "import os\r\nimport rpy2.robjects as robjects\r\nimport numpy as np\r\nimport pandas as pd\r\nfrom sklearn import linear_model\r\n\r\n\r\ndef rank_gene_dropouts(DataPath, CV_RDataPath, OutputDir):\r\n    '''\r\n    Script to rank the genes in the training set of the inputfile based on their dropout level.\r\n    This rank is written to a file.\r\n    \r\n    Parameters \r\n    ----------\r\n    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes \r\n    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.\r\n    OutputDir : Output directory defining the path of the exported file.\r\n    '''\r\n        \r\n    # read the Rdata file\r\n    robjects.r['load'](CV_RDataPath)\r\n\r\n    nfolds = np.array(robjects.r['n_folds'], dtype = 'int')\r\n    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')\r\n    train_ind = np.array(robjects.r['Train_Idx'])\r\n    \r\n    # read the data\r\n    data = pd.read_csv(DataPath,index_col=0,sep=',')\r\n    data = data.iloc[tokeep]\r\n    data = np.log2(data+1)\r\n    \r\n    genes = np.zeros([np.shape(data)[1],np.squeeze(nfolds)], dtype = '>U10')\r\n        \r\n    for i in range(np.squeeze(nfolds)):\r\n        train_ind_i = np.array(train_ind[i], dtype = 'int') - 1\r\n        train=data.iloc[train_ind_i]\r\n        train.columns = np.arange(len(train.columns))\r\n            \r\n        # rank genes training set \r\n        dropout = (train == 0).sum(axis='rows')\r\n        dropout = (dropout / train.shape[0]) * 100\r\n        mean = train.mean(axis='rows')\r\n            \r\n        notzero = np.where((np.array(mean) > 0) & (np.array(dropout) > 0))[0]\r\n        zero = np.where(~((np.array(mean) > 0) & (np.array(dropout) > 0)))[0]\r\n        train_notzero = train.iloc[:,notzero]\r\n        train_zero = train.iloc[:,zero]\r\n        zero_genes = train_zero.columns\r\n            \r\n        dropout = dropout.iloc[notzero]\r\n        
mean = mean.iloc[notzero]\r\n    \r\n        dropout = np.log2(np.array(dropout)).reshape(-1,1)\r\n        mean = np.array(mean).reshape(-1,1)\r\n        reg = linear_model.LinearRegression()\r\n        reg.fit(mean,dropout)\r\n    \r\n        residuals = dropout - reg.predict(mean)\r\n        residuals = pd.Series(np.array(residuals).ravel(),index=train_notzero.columns)\r\n        residuals = residuals.sort_values(ascending=False)\r\n        sorted_genes = residuals.index\r\n        sorted_genes = sorted_genes.append(zero_genes)\r\n            \r\n        genes[:,i] = sorted_genes.values\r\n            \r\n    \r\n    genes = pd.DataFrame(genes)\r\n    \r\n    os.chdir(OutputDir)\r\n    genes.to_csv(\"rank_genes_dropouts.csv\", index = False)\r\n\r\n        \r\n\r\n"
  }
]