Repository: tabdelaal/scRNAseq_Benchmark
Branch: master
Commit: 553869b632f4
Files: 82
Total size: 288.4 KB

Directory structure:
gitextract_ikyozzhh/

├── Cross_Validation.R
├── DEgenesMAST.R
├── LICENSE
├── README.md
├── Scripts/
│   ├── run_ACTINN.py
│   ├── run_CHETAH.R
│   ├── run_CaSTLe.R
│   ├── run_Cell_BLAST.py
│   ├── run_DigitalCellSorter.py
│   ├── run_Garnett_CV.R
│   ├── run_Garnett_Pretrained.R
│   ├── run_LAmbDA.py
│   ├── run_LDA.py
│   ├── run_LDA_rejection.py
│   ├── run_NMC.py
│   ├── run_RF.py
│   ├── run_SCINA.R
│   ├── run_SVM.py
│   ├── run_SVM_rejection.py
│   ├── run_SingleR.R
│   ├── run_kNN50.py
│   ├── run_kNN9.py
│   ├── run_moana.py
│   ├── run_scID.R
│   ├── run_scPred.R
│   ├── run_scVI.py
│   ├── run_scmap.R
│   └── run_singleCellNet.R
├── Snakemake/
│   ├── Cross_Validation.R
│   ├── DEgenesMAST.R
│   ├── Dockerfiles/
│   │   ├── baseline/
│   │   │   └── Dockerfile
│   │   ├── cell_blast/
│   │   │   └── Dockerfile
│   │   ├── chetah/
│   │   │   ├── Dockerfile
│   │   │   └── install_packages.R
│   │   ├── cross_validation/
│   │   │   ├── Dockerfile
│   │   │   └── install_packages.R
│   │   ├── garnett/
│   │   │   ├── Dockerfile
│   │   │   └── install_packages.R
│   │   ├── scid/
│   │   │   ├── Dockerfile
│   │   │   └── install_packages.R
│   │   ├── scmap/
│   │   │   ├── Dockerfile
│   │   │   └── install_packages.R
│   │   ├── scvi/
│   │   │   └── Dockerfile
│   │   ├── singlecellnet/
│   │   │   ├── Dockerfile
│   │   │   └── install_packages.R
│   │   └── singler/
│   │       ├── Dockerfile
│   │       └── install_packages.R
│   ├── LICENSE
│   ├── README.md
│   ├── Scripts/
│   │   ├── run_ACTINN.py
│   │   ├── run_CHETAH.R
│   │   ├── run_CaSTLe.R
│   │   ├── run_Cell_BLAST.py
│   │   ├── run_DigitalCellSorter.py
│   │   ├── run_Garnett_CV.R
│   │   ├── run_Garnett_Pretrained.R
│   │   ├── run_LAmbDA.py
│   │   ├── run_LDA.py
│   │   ├── run_LDA_rejection.py
│   │   ├── run_NMC.py
│   │   ├── run_RF.py
│   │   ├── run_SCINA.R
│   │   ├── run_SVM.py
│   │   ├── run_SVM_rejection.py
│   │   ├── run_SingleR.R
│   │   ├── run_kNN50.py
│   │   ├── run_kNN9.py
│   │   ├── run_moana.py
│   │   ├── run_scID.R
│   │   ├── run_scPred.R
│   │   ├── run_scVI.py
│   │   ├── run_scmap.R
│   │   ├── run_scmapcell.R
│   │   ├── run_scmapcluster.R
│   │   ├── run_scmaptotal.R
│   │   └── run_singleCellNet.R
│   ├── Snakefile
│   ├── evaluate.R
│   ├── example.config.yml
│   └── rank_gene_dropouts.py
├── evaluate.R
└── rank_gene_dropouts.py

================================================
FILE CONTENTS
================================================

================================================
FILE: Cross_Validation.R
================================================
Cross_Validation <- function(LabelsPath, col_Index = 1, OutputDir) {
  "
  Cross_Validation
  Builds stratified 5-fold cross-validation indices from a cell annotation file
  and saves them to 'CV_folds.RData', the input expected by the classifier wrappers.
  Cell populations with 10 or fewer cells are excluded before the folds are drawn.

  Parameters
  ----------
  LabelsPath : Cell population annotations file path (.csv).
  col_Index : column index (integer) defining which level of annotation to use,
  in case of multiple cell type annotations (default is 1)
  OutputDir : Output directory in which 'CV_folds.RData' is written.

  Side effects
  ------------
  Attaches rBayesianOptimization, changes the working directory to OutputDir
  (it is not restored) and saves n_folds, Train_Idx, Test_Idx, col_Index and
  Cells_to_Keep. The fold indices refer to the cells left after filtering.
  "

  annotation <- as.matrix(read.csv(LabelsPath))
  Labels <- as.vector(annotation[, col_Index])

  # Flag every cell that belongs to a population with more than 10 cells.
  class_sizes <- table(Labels)
  small_classes <- names(class_sizes)[!(class_sizes > 10)]
  Cells_to_Keep <- !(Labels %in% small_classes)
  Labels <- Labels[Cells_to_Keep]

  # Getting training and testing Folds
  library(rBayesianOptimization)
  n_folds <- 5
  Folds <- KFold(Labels, nfolds = n_folds, stratified = TRUE)

  # Output fold k holds out KFold's fold (n_folds - k + 1): folds are used in
  # reverse order, and all remaining folds together form the training set.
  held_out <- c(n_folds:1)
  fold_ids <- seq_along(Folds)
  Train_Idx <- lapply(fold_ids, function(k) unlist(Folds[-held_out[k]]))
  Test_Idx <- lapply(fold_ids, function(k) Folds[[held_out[k]]])

  setwd(OutputDir)
  save(n_folds, Train_Idx, Test_Idx, col_Index, Cells_to_Keep, file = 'CV_folds.RData')
}

================================================
FILE: DEgenesMAST.R
================================================
DEgenesMAST <- function(Data, Labels, Normalize = FALSE, LogTransform = FALSE) {
  # One-vs-all differential expression with MAST (through Seurat's FindAllMarkers),
  # meant to be run on the training data of a cross-validation fold.
  #
  # Arguments
  #   Data:         genes X cells (rows = genes, columns = cells)
  #   Labels:       cell population label of every column of Data
  #   Normalize:    set to TRUE when Data is not yet cpm normalized; each cell is
  #                 then scaled to a total of 1e6
  #   LogTransform: set to TRUE when Data is not yet log-transformed; log2(x + 1)
  #                 is then applied
  #
  # Value
  #   A 20 x (number of populations) matrix, one column per population, holding
  #   the first 20 genes with positive avg_logFC in the order FindAllMarkers
  #   returns them. Populations with fewer than 20 such genes are padded with NA.
  #   The matrix can be rewritten into the marker file format of the
  #   prior-knowledge classifiers and used to classify the test set.

  library(Seurat)

  if (Normalize) {
    Data <- apply(Data, 2, function(x) (x/sum(x))*1000000)
  }
  if (LogTransform) {
    Data <- log(Data+1, base = 2)
  }

  # NOTE(review): the raw.data / ident.use arguments look like the Seurat v2
  # interface - confirm against the installed Seurat version.
  seurat_obj <- CreateSeuratObject(raw.data = Data, project = "DEgenes")
  seurat_obj <- SetIdent(seurat_obj, ident.use = Labels)
  de_table <- FindAllMarkers(seurat_obj, test.use = "MAST")

  populations <- unique(Labels)
  Markers <- matrix(nrow = 20, ncol = length(populations))
  colnames(Markers) <- populations

  for (population in populations) {
    in_cluster <- de_table$cluster == population
    up_genes <- de_table$gene[(in_cluster & (de_table$avg_logFC > 0))]

    # Diagnostic output: adjusted p-values of the first 20 rows of this cluster.
    adjusted_p <- de_table$p_val_adj[in_cluster]
    print(adjusted_p[1:20])

    # Fill at most 20 rows; anything beyond the available genes stays NA.
    n_fill <- min(length(up_genes), 20)
    if (n_fill > 0) {
      Markers[seq_len(n_fill), population] <- up_genes[seq_len(n_fill)]
    }
  }
  return(Markers)
}


================================================
FILE: LICENSE
================================================
MIT License

Copyright (c) 2019 tabdelaal

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.


================================================
FILE: README.md
================================================
# A comparison of automatic cell identification methods for single-cell RNA-sequencing data
We present a comprehensive evaluation of the performance of state-of-the-art classification methods, in addition to general-purpose classifiers, for automatic cell identification in single-cell RNA-sequencing datasets. Our goal is to provide the community with a fair evaluation of all available methods to facilitate the users’ choice and to direct further developments toward the challenging aspects of automated cell type identification. (Published in Genome Biology, Sep. 2019: https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1795-z)

### Repository description
We provide all the scripts to run and evaluate all classifiers, and to reproduce the results introduced in the paper.


1. The 'Scripts' folder contains one wrapper function per classifier; each wrapper reads the data and applies the corresponding classification method.
2. ```Cross_Validation``` R script can be used to produce training and test indices for cross validation.
3. ```rank_gene_dropouts``` Python script can be used to apply feature selection using the dropout method, and rank genes accordingly.
4. ```evaluate``` R script can be used to evaluate the prediction of a certain classifier and obtain scores such as accuracy, median F1-score and % unlabeled cells.

For more details, please check function documentations.

### General Usage

To benchmark and fairly evaluate the performance of different classifiers using benchmark-datasets (Filtered datasets can be downloaded from https://zenodo.org/record/3357167), apply the following steps:

#### Step 1

Apply the ```Cross_Validation``` R function on the corresponding dataset to obtain fixed training and test cell indices, stratified across different cell types. For example, using the Tabula Muris (TM) dataset:

```R
Cross_Validation('~/TM/Labels.csv', 1, '~/TM/')
```

This command will create a ```CV_folds.RData``` file used as input in Step 2.

#### Step 2

Run each classifier wrapper. For example, running scPred on TM dataset

```R
run_scPred('~/TM/Filtered_TM_data.csv','~/TM/Labels.csv','~/TM/CV_folds.RData','~/Results/TM/')
```

This command will output the true and predicted cell labels as csv files, as well as the classifier computation time.

#### Step 3

Evaluate the classifier prediction by 

```R
result <- evaluate('~/Results/TM/scPred_True_Labels.csv', '~/Results/TM/scPred_Pred_Labels.csv')
```

This command will return the corresponding accuracy, median F1-score, F1-scores for all cell populations, % unlabeled cells, and confusion matrix.

### Usage with feature selection

#### Step 1

Apply the ```Cross_Validation``` R function on the corresponding dataset to obtain fixed training and test cell indices, stratified across different cell types. For example, using the Tabula Muris (TM) dataset:

```R
Cross_Validation('~/TM/Labels.csv', 1, '~/TM/')
```

This command will create a ```CV_folds.RData``` file used as input in Step 2 and 3.

#### Step 2

Apply the ```rank_gene_dropouts``` Python script to get the gene ranking for each training fold using the dropout criterion:

```
rank_gene_dropouts('~/TM/Filtered_TM_data.csv', '~/TM/CV_folds.RData', '~/TM/')
```

This command will create a ```rank_genes_dropouts.csv``` file used as input in Step 3.

#### Step 3

Run each classifier wrapper. For example, running scPred on TM dataset with 1000 genes

```R
run_scPred('~/TM/Filtered_TM_data.csv','~/TM/Labels.csv','~/TM/CV_folds.RData','~/Results/TM/',
GeneOrderPath = '~/TM/rank_genes_dropouts.csv',NumGenes = 1000)
```

This command will output the true and predicted cell labels as csv files, as well as the classifier computation time.

#### Step 4

Evaluate the classifier prediction by 

```R
result <- evaluate('~/Results/TM/scPred_True_Labels.csv', '~/Results/TM/scPred_Pred_Labels.csv')
```

This command will return the corresponding accuracy, median F1-score, F1-scores for all cell populations, % unlabeled cells, and confusion matrix.

### Evaluate Marker-based methods using DE genes

To evaluate the marker-based methods SCINA, DigitalCellSorter and Garnett using DE genes learned from the data, you may follow these steps:

#### Step 1

Apply the ```Cross_Validation``` R function on the corresponding dataset to obtain fixed training and test cell indices, stratified across different cell types. For example, using the Zheng_sorted dataset:

```R
Cross_Validation('~/Zheng_sorted/Labels.csv', 1, '~/Zheng_sorted/')
```

This command will create a ```CV_folds.RData``` file used as input in Step 2 and 3.

#### Step 2

For each fold use the training data to get the DE genes using the ```DEgenesMAST``` R function, and pass these DE genes to the corresponding method, for example here we use SCINA, to obtain cell prediction for the test data.

```R
load('CV_folds.RData')
Data <- read.csv('~/Zheng_sorted/Filtered_DownSampled_SortedPBMC_data',row.names = 1)
Labels <- as.matrix(read.csv('~/Zheng_sorted/Labels.csv'))
Labels <- as.vector(Labels[,col_Index])
Data <- Data[Cells_to_Keep,]
Labels <- Labels[Cells_to_Keep]

for (i in c(1:n_folds))
{
    MarkerGenes <-  DEgenesMAST(t(Data[Train_Idx[[i]],]), Labels[Train_Idx[[i]]], Normalize = TRUE, LogTransform = TRUE)
    ## write the MarkerGenes into a marker genes file format, depending on the tested method, for example for SCINA
    write.csv(MarkerGenes, 'MarkerGenes.csv')
    ## run the SCINA wrapper using these DE marker genes
    run_SCINA(Data[Test_Idx[[i]],], Labels[Test_Idx[[i]]], 'MarkerGenes.csv', '~/Results/Zheng_sorted/')
}
```

### Snakemake

To support future extension of this benchmarking work with new classifiers and datasets, we provide a Snakemake workflow to automate the performed benchmarking analyses (https://github.com/tabdelaal/scRNAseq_Benchmark/tree/snakemake_and_docker).


================================================
FILE: Scripts/run_ACTINN.py
================================================
import os 
import numpy as np
import pandas as pd
import time as tm
import rpy2.robjects as robjects

def run_ACTINN(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
    '''
    run ACTINN
    Runs the external ACTINN scripts on a benchmark dataset, once per
    cross-validation fold, and writes the true labels, the predicted labels and
    the per-fold prediction time as csv files into OutputDir.

    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
    as row names and gene names as column names.
    LabelsPath : Cell population annotations file path (.csv).
    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
    OutputDir : Output directory; the process changes into it, and both the
    intermediate train/test files and the result files are written there.
    GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
    defining the genes order for each cross validation fold, default is "".
    NumGenes : Number of genes used in case of feature selection (integer), default is 0
    (no feature selection).
    '''

    # Pull the fold definition out of the RData file through the embedded R session.
    robjects.r['load'](CV_RDataPath)
    r_env = robjects.r

    fold_count = np.array(r_env['n_folds'], dtype = 'int')
    keep_mask = np.array(r_env['Cells_to_Keep'], dtype = 'bool')
    # R column indices are 1-based, pandas expects 0-based.
    label_col = np.array(r_env['col_Index'], dtype = 'int') - 1
    test_folds = np.array(r_env['Test_Idx'])
    train_folds = np.array(r_env['Train_Idx'])

    # Expression matrix and annotations, restricted to the retained cells.
    expression = pd.read_csv(DataPath,index_col=0,sep=',')
    annotation = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = label_col)

    annotation = annotation.iloc[keep_mask]
    expression = expression.iloc[keep_mask]

    # Per-fold gene ranking, only needed with feature selection.
    if NumGenes > 0:
        gene_ranking = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')

    # Everything below reads and writes relative to the results folder.
    os.chdir(OutputDir)

    fold_seconds = []
    true_labels = []
    predicted_labels = []

    for fold in range(np.squeeze(fold_count)):
        # Fold indices come from R and are 1-based.
        test_rows = np.array(test_folds[fold], dtype = 'int') - 1
        train_rows = np.array(train_folds[fold], dtype = 'int') - 1

        train_expr = expression.iloc[train_rows]
        test_expr = expression.iloc[test_rows]
        train_labels = annotation.iloc[train_rows]
        test_labels = annotation.iloc[test_rows]

        if NumGenes > 0:
            selected_genes = gene_ranking.iloc[0:NumGenes,fold]
            train_expr = train_expr.iloc[:,selected_genes]
            test_expr = test_expr.iloc[:,selected_genes]

        # The files handed to ACTINN are written genes x cells.
        train_expr.transpose().to_csv("train.csv")
        test_expr.transpose().to_csv("test.csv")
        train_labels.to_csv("train_lab.csv", header = False, index = True, sep = '\t')
        test_labels.to_csv("test_lab.csv", header = False, index = True, sep = '\t')

        # NOTE(review): fixed one-minute pause, presumably to let the files settle
        # on a shared filesystem before the external scripts read them - confirm.
        tm.sleep(60)

        # Convert both csv files; the prediction step below expects train.h5 / test.h5.
        os.system("python /home/nfs/lcmmichielsen/classifiers/ACTINN/actinn_format.py -i train.csv -o train -f csv")
        os.system("python /home/nfs/lcmmichielsen/classifiers/ACTINN/actinn_format.py -i test.csv -o test -f csv")

        # Only the prediction call is timed.
        started = tm.time()
        os.system("python /home/nfs/lcmmichielsen/classifiers/ACTINN/actinn_predict.py -trs train.h5 -trl train_lab.csv -ts test.h5")    
        fold_seconds.append(tm.time()-started)

        tm.sleep(60)

        true_labels.extend(test_labels.values)
        fold_prediction = pd.read_csv('predicted_label.txt',header=0,index_col=None, sep='\t', usecols = [1])
        predicted_labels.extend(fold_prediction.values)

    # File names carry the gene count when feature selection was used.
    if NumGenes == 0:
        file_names = ("ACTINN_True_Labels.csv",
                      "ACTINN_Pred_Labels.csv",
                      "ACTINN_Total_Time.csv")
    else:
        file_names = ("ACTINN_" + str(NumGenes) + "_True_Labels.csv",
                      "ACTINN_" + str(NumGenes) + "_Pred_Labels.csv",
                      "ACTINN_" + str(NumGenes) + "_Total_Time.csv")

    result_frames = (pd.DataFrame(true_labels),
                     pd.DataFrame(predicted_labels),
                     pd.DataFrame(fold_seconds))

    for frame, file_name in zip(result_frames, file_names):
        frame.to_csv(file_name, index = False)
        
        
================================================
FILE: Scripts/run_CHETAH.R
================================================
run_CHETAH<-function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){
  "
  run CHETAH
  Runs CHETAH once per cross-validation fold of a benchmark dataset and writes
  the true labels, the predicted labels and the per-fold classification time
  (seconds) as csv files. The working directory is changed to OutputDir before
  writing and is not restored.

  Parameters
  ----------
  DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
  as row names and gene names as column names.
  LabelsPath : Cell population annotations file path (.csv).
  CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
  OutputDir : Output directory defining the path of the exported file.
  GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
  defining the genes order for each cross validation fold, default is NULL.
  NumGenes : Number of genes used in case of feature selection (integer), default is NULL.
  Feature selection is applied only when both GeneOrderPath and NumGenes are given.
  "

  use_feature_selection <- !is.null(GeneOrderPath) && !is.null(NumGenes)

  expr <- read.csv(DataPath, row.names = 1)
  annotation <- as.matrix(read.csv(LabelsPath))
  # Brings n_folds, Train_Idx, Test_Idx, col_Index and Cells_to_Keep into scope.
  load(CV_RDataPath)
  cell_labels <- as.vector(annotation[, col_Index])
  expr <- expr[Cells_to_Keep, ]
  cell_labels <- cell_labels[Cells_to_Keep]
  if (use_feature_selection) {
    gene_ranking <- read.csv(GeneOrderPath)
  }

  #############################################################################
  #                                CHETAH                                     #
  #############################################################################
  library(CHETAH)
  library(SingleCellExperiment)

  # From here on the matrix is genes x cells.
  expr <- t(as.matrix(expr))

  # Wrap a gene/cell subset and its labels into a SingleCellExperiment.
  make_sce <- function(gene_rows, cell_cols) {
    SingleCellExperiment(assays = list(counts = expr[gene_rows, cell_cols]),
                         colData = data.frame(celltypes = cell_labels[cell_cols]))
  }

  true_by_fold <- vector("list", n_folds)
  pred_by_fold <- vector("list", n_folds)
  secs_by_fold <- numeric(n_folds)

  for (fold in c(1:n_folds)) {
    if (use_feature_selection) {
      # The ranking file holds 0-based gene positions, hence the + 1.
      gene_rows <- as.vector(gene_ranking[c(1:NumGenes), fold]) + 1
    } else {
      gene_rows <- seq_len(nrow(expr))
    }

    reference <- make_sce(gene_rows, Train_Idx[[fold]])
    query <- make_sce(gene_rows, Test_Idx[[fold]])

    # Only the classifier call is timed.
    start_time <- Sys.time()
    if (use_feature_selection) {
      query <- CHETAHclassifier(input = query, ref_cells = reference, n_genes = NumGenes)
    } else {
      query <- CHETAHclassifier(input = query, ref_cells = reference)
    }
    end_time <- Sys.time()

    secs_by_fold[fold] <- as.numeric(difftime(end_time, start_time, units = 'secs'))
    true_by_fold[fold] <- list(cell_labels[Test_Idx[[fold]]])
    pred_by_fold[fold] <- list(query$celltype_CHETAH)
  }

  all_true <- as.vector(unlist(true_by_fold))
  all_pred <- as.vector(unlist(pred_by_fold))
  all_secs <- as.vector(secs_by_fold)

  setwd(OutputDir)

  # File names carry the gene count when feature selection was used.
  if (use_feature_selection) {
    write.csv(all_true, paste0('CHETAH_', NumGenes, '_True_Labels.csv'), row.names = FALSE)
    write.csv(all_pred, paste0('CHETAH_', NumGenes, '_Pred_Labels.csv'), row.names = FALSE)
    write.csv(all_secs, paste0('CHETAH_', NumGenes, '_Total_Time.csv'), row.names = FALSE)
  } else {
    write.csv(all_true, 'CHETAH_True_Labels.csv', row.names = FALSE)
    write.csv(all_pred, 'CHETAH_Pred_Labels.csv', row.names = FALSE)
    write.csv(all_secs, 'CHETAH_Total_Time.csv', row.names = FALSE)
  }
}


================================================
FILE: Scripts/run_CaSTLe.R
================================================
run_CaSTLe<-function(DataPath,LabelsPath,CV_RDataPath, OutputDir, GeneOrderPath = NULL, NumGenes = NULL){
  "
  run CaSTLe
  Wrapper script to run CaSTLe on a benchmark dataset with 5-fold cross validation,
  outputs lists of true and predicted cell labels as csv files, as well as the
  training and testing computation time (seconds) of every fold.
  
  Parameters
  ----------
  DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 
  as row names and gene names as column names.
  LabelsPath : Cell population annotations file path (.csv).
  CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
  OutputDir : Output directory in which the csv files are written. The working
  directory is left unchanged.
  GeneOrderPath : Gene order file path (.csv) obtained from feature selection, 
  defining the genes order for each cross validation fold, default is NULL.
  NumGenes : Number of genes used in case of feature selection (integer), default is NULL.
  Feature selection is applied only when both GeneOrderPath and NumGenes are given.
  "
  
  # Both arguments are scalars, so the scalar operator is the right one here.
  UseFeatureSelection <- !is.null(GeneOrderPath) && !is.null(NumGenes)
  
  Data <- read.csv(DataPath,row.names = 1)
  Labels <- as.matrix(read.csv(LabelsPath))
  # Brings n_folds, Train_Idx, Test_Idx, col_Index and Cells_to_Keep into scope.
  load(CV_RDataPath)
  Labels <- as.vector(Labels[,col_Index])
  Data <- Data[Cells_to_Keep,]
  Labels <- Labels[Cells_to_Keep]
  if(UseFeatureSelection){
    GenesOrder = read.csv(GeneOrderPath)
  }
  
  #############################################################################
  #                                CaSTLe                                     #
  #############################################################################
  library(igraph)
  library(xgboost)
  True_Labels_Castle <- list()
  Pred_Labels_Castle <- list()
  Training_Time_Castle <- list()
  Testing_Time_Castle <- list()
  
  # Bin edges used to discretise expression values, and the number of top
  # features taken from each ranking.
  BREAKS=c(-1, 0, 1, 6, Inf)
  nFeatures = 100
  
  for(i in c(1:n_folds)){
    # 1. Load datasets
    if(UseFeatureSelection){
      # The ranking file holds 0-based gene positions, hence the + 1.
      ds1 = Data[Train_Idx[[i]],as.vector(GenesOrder[c(1:NumGenes),i])+1]
      ds2 = Data[Test_Idx[[i]],as.vector(GenesOrder[c(1:NumGenes),i])+1]
    }
    else{
      ds1 = Data[Train_Idx[[i]],]
      ds2 = Data[Test_Idx[[i]],]
    }
    
    sourceCellTypes = as.factor(Labels[Train_Idx[[i]]])
    targetCellTypes = as.factor(Labels[Test_Idx[[i]]])
    
    start_time <- Sys.time()
    # 2. Unify sets, excluding low expressed genes
    source_n_cells_counts = apply(ds1, 2, function(x) { sum(x > 0) } )
    target_n_cells_counts = apply(ds2, 2, function(x) { sum(x > 0) } )
    common_genes = intersect( colnames(ds1)[source_n_cells_counts>10], 
                              colnames(ds2)[target_n_cells_counts>10])
    remove(source_n_cells_counts, target_n_cells_counts)
    ds1 = ds1[, colnames(ds1) %in% common_genes]
    ds2 = ds2[, colnames(ds2) %in% common_genes]
    ds = rbind(ds1[,common_genes], ds2[,common_genes])
    isSource = c(rep(TRUE,nrow(ds1)), rep(FALSE,nrow(ds2)))
    remove(ds1, ds2)
    
    # 3. Highest mean in both source and target
    topFeaturesAvg = colnames(ds)[order(apply(ds, 2, mean), decreasing = TRUE)]
    end_time <- Sys.time()
    Training_Time_Castle[i] <- as.numeric(difftime(end_time,start_time,units = 'secs'))
    
    start_time <- Sys.time()
    # for each cell - what is the most probable classification?
    # One row per training population, one column per test cell.
    L = length(levels(sourceCellTypes))
    targetClassification = as.data.frame(matrix(rep(0,L*sum(!isSource)), nrow=L), row.names = levels(sourceCellTypes))
    
    # One binary (population vs rest) xgboost model per training population.
    for (cellType in levels(sourceCellTypes)) {
      
      inSourceCellType = as.factor(ifelse(sourceCellTypes == cellType, cellType, paste0("NOT",cellType)))
      
      # 4. Highest mutual information in source
      topFeaturesMi = names(sort(apply(ds[isSource,],2,function(x) { compare(cut(x,breaks=BREAKS),inSourceCellType,method = "nmi") }), decreasing = TRUE))
      
      # 5. Top n genes that appear in both mi and avg
      selectedFeatures = union(head(topFeaturesAvg, nFeatures) , head(topFeaturesMi, nFeatures) )
      
      # 6. remove correlated features
      # NOTE(review): the diagonal and upper triangle are zeroed first, so every
      # column contains a value below 0.9 and this filter appears to keep all
      # features. Left untouched so that benchmark results do not change.
      tmp = cor(ds[,selectedFeatures], method = "pearson")
      tmp[!lower.tri(tmp)] = 0
      selectedFeatures = selectedFeatures[apply(tmp,2,function(x) any(x < 0.9))]
      remove(tmp)
      
      # 7,8. Convert data from continous to binned dummy vars
      # break datasets to bins
      dsBins = apply(ds[, selectedFeatures], 2, cut, breaks= BREAKS)
      # use only bins with more than one value
      nUniq = apply(dsBins, 2, function(x) { length(unique(x)) })
      # convert to dummy vars
      ds0 = model.matrix(~ . , as.data.frame(dsBins[,nUniq>1]))
      remove(dsBins, nUniq)
      
      cat(paste0("<h2>Classifier for ",cellType,"</h2>"))
      
      inTypeSource = sourceCellTypes == cellType
      # 9. Classify
      xg=xgboost(data=ds0[isSource,] , 
                 label=inTypeSource,
                 objective="binary:logistic", 
                 eta=0.7 , nthread=1, nround=20, verbose=0,
                 gamma=0.001, max_depth=5, min_child_weight=10)
      
      # 10. Predict
      inTypeProb = predict(xg, ds0[!isSource, ])
      
      targetClassification[cellType,] = inTypeProb
    }
    end_time <- Sys.time()
    Testing_Time_Castle[i] <- as.numeric(difftime(end_time,start_time,units = 'secs'))
    
    # Each test cell gets the population whose model returned the highest probability.
    True_Labels_Castle[i] <- list(Labels[Test_Idx[[i]]])
    Pred_Labels_Castle[i] <- list(rownames(targetClassification)[apply(targetClassification,2,which.max)])
  }
  True_Labels_Castle <- as.vector(unlist(True_Labels_Castle))
  Pred_Labels_Castle <- as.vector(unlist(Pred_Labels_Castle))
  Training_Time_Castle <- as.vector(unlist(Training_Time_Castle))
  Testing_Time_Castle <- as.vector(unlist(Testing_Time_Castle))
  
  # Results go into OutputDir. Previously this argument was ignored and the
  # files were written to whatever the current working directory was.
  OutputFile <- function(FileName) file.path(OutputDir, FileName)
  
  if(UseFeatureSelection){
    write.csv(True_Labels_Castle,OutputFile(paste('True_Labels_Castle_',NumGenes,'.csv', sep = '')),row.names = FALSE)
    write.csv(Pred_Labels_Castle,OutputFile(paste('Pred_Labels_Castle_',NumGenes,'.csv', sep = '')),row.names = FALSE)
    write.csv(Training_Time_Castle,OutputFile(paste('Training_Time_Castle_',NumGenes,'.csv', sep = '')),row.names = FALSE)
    write.csv(Testing_Time_Castle,OutputFile(paste('Testing_Time_Castle_',NumGenes,'.csv', sep = '')),row.names = FALSE)
  }
  else{
    write.csv(True_Labels_Castle,OutputFile('True_Labels_CaSTLe.csv'),row.names = FALSE)
    write.csv(Pred_Labels_Castle,OutputFile('Pred_Labels_CaSTLe.csv'),row.names = FALSE)
    write.csv(Training_Time_Castle,OutputFile('Training_Time_CaSTLe.csv'),row.names = FALSE)
    write.csv(Testing_Time_Castle,OutputFile('Testing_Time_CaSTLe.csv'),row.names = FALSE)
  }
  
}

================================================
FILE: Scripts/run_Cell_BLAST.py
================================================
import os
import time as tm
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

import tensorflow as tf
tf.logging.set_verbosity(0)

import Cell_BLAST as cb
import numpy as np
from numpy import genfromtxt as gft
import rpy2.robjects as robjects


def run_Cell_BLAST(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
    '''
    run Cell_BLAST
    Wrapper script to run Cell_BLAST on a benchmark dataset with 5-fold cross validation,
    outputs lists of true and predicted cell labels as csv files, as well as the
    training and testing computation time of every fold.
  
    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 
    as row names and gene names as column names.
    LabelsPath : Cell population annotations file path (.csv).
    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
    OutputDir : Output directory; the process changes into it before training,
    so the model folders and the result files are written there.
    GeneOrderPath : Gene order file path (.csv) obtained from feature selection, 
    defining the genes order for each cross validation fold, default is "".
    NumGenes : Number of genes used in case of feature selection (integer), default is 0
    (no feature selection).
    '''
        
    # read the Rdata file
    robjects.r['load'](CV_RDataPath)

    nfolds = np.array(robjects.r['n_folds'], dtype = 'int')
    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
    # R column indices are 1-based, numpy expects 0-based.
    col = np.array(robjects.r['col_Index'], dtype = 'int')
    col = col - 1 
    test_ind = np.array(robjects.r['Test_Idx'])
    train_ind = np.array(robjects.r['Train_Idx'])
    
    # read the feature file
    if (NumGenes > 0):
        features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')

    # read the data, normalize it and keep only the retained cells
    data_old = cb.data.ExprDataSet.read_table(DataPath,orientation="cg", sep=",", index_col = 0, header = 0, sparsify = True).normalize()
    data = cb.data.ExprDataSet(data_old.exprs[tokeep],data_old.obs.iloc[tokeep],data_old.var,data_old.uns)

    # read the labels once, as a string array; an earlier pandas read of the same
    # file was overwritten here before being used and has been dropped.
    labels = gft(LabelsPath, dtype = "str", skip_header = 1, delimiter = ",", usecols = col)      
    labels = labels[tokeep]

    os.chdir(OutputDir)
    
    truelab = []
    pred = []
    tr_time = []
    ts_time = []
    
    for i in range(np.squeeze(nfolds)):
        # Fold indices come from R and are 1-based.
        test_ind_i = np.array(test_ind[i], dtype = 'int') - 1
        train_ind_i = np.array(train_ind[i], dtype = 'int') - 1

        train=data[train_ind_i,:]
        test=data[test_ind_i,:]
        y_train = labels[train_ind_i]
        y_test = labels[test_ind_i]
        
        if (NumGenes > 0):
            feat_to_use = features.iloc[0:NumGenes,i]
            train = train[:,feat_to_use]
            test = test[:,feat_to_use]

        # The annotation to transfer is stored on the training set itself.
        train.obs['cell_type'] = y_train
                
        # Training time covers fitting the four models and building the BLAST index.
        start = tm.time()
                
        # reduce dimensions
        num_epoch = 50
        models = []
    
        # Four DIRECTi models with seeds 0..3, each saved under a folder named after its seed.
        for j in range(4):
            models.append(cb.directi.fit_DIRECTi(train, epoch=num_epoch, patience=10, random_seed = j, path="%d" % j))
    
        # train model
        blast = cb.blast.BLAST(models, train).build_empirical()
        tr_time.append(tm.time()-start)
        
        # predict labels
        start = tm.time()
        test_pred = blast.query(test).annotate('cell_type')
        ts_time.append(tm.time()-start)

        truelab.extend(y_test)
        pred.extend(test_pred.values)
    
    #write results    
    truelab = pd.DataFrame(truelab)
    pred = pd.DataFrame(pred)
            
    tr_time = pd.DataFrame(tr_time)
    ts_time = pd.DataFrame(ts_time)
    
    # File names carry the gene count when feature selection was used.
    if (NumGenes == 0):  
        truelab.to_csv("Cell_BLAST_True_Labels.csv", index = False)
        pred.to_csv("Cell_BLAST_Pred_Labels.csv", index = False)
        tr_time.to_csv("Cell_BLAST_Training_Time.csv", index = False)
        ts_time.to_csv("Cell_BLAST_Testing_Time.csv", index = False)
    else:
        truelab.to_csv("Cell_BLAST_" + str(NumGenes) + "_True_Labels.csv", index = False)
        pred.to_csv("Cell_BLAST_" + str(NumGenes) + "_Pred_Labels.csv", index = False)
        tr_time.to_csv("Cell_BLAST_" + str(NumGenes) + "_Training_Time.csv", index = False)
        ts_time.to_csv("Cell_BLAST_" + str(NumGenes) + "_Testing_Time.csv", index = False)
        

================================================
FILE: Scripts/run_DigitalCellSorter.py
================================================
import numpy as np
import pandas as pd
import scripts.DigitalCellSorter as DigitalCellSorter
import os
import time as tm
import rpy2.robjects as robjects

def run_DigitalCellSorter(DataPath, LabelsPath, CV_RDataPath, GeneListPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
    '''
    run DigitalCellSorter
    Wrapper script to run DigitalCellSorter on a benchmark dataset using a predefined gene list,
    outputs lists of true and predicted cell labels as csv files, as well as the total computation time.

    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
    as row names and gene names as column names.
    LabelsPath : Cell population annotations file path (.csv).
    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
    Only 'Cells_to_Keep' and 'col_Index' are read from it; no folds are used here.
    GeneListPath : File path to the marker gene list passed to DigitalCellSorter.
    OutputDir : Output directory defining the path of the exported file.
    GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
    only its first column is used, default is "".
    NumGenes : Number of genes used in case of feature selection (integer), default is 0.

    Side effects
    ------------
    Changes the working directory to OutputDir and writes the label/time csv files there.
    '''

    # read the Rdata file
    robjects.r['load'](CV_RDataPath)

    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
    col = np.array(robjects.r['col_Index'], dtype = 'int')
    col = col - 1   # R indices are 1-based

    # read the data
    data = pd.read_csv(DataPath,index_col=0,sep=',')
    data = data.iloc[tokeep]

    truelab = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col)
    truelab = truelab.iloc[tokeep]

    # read the feature file
    if (NumGenes > 0):
        features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')
        feat_to_use = features.iloc[0:NumGenes,0]
        data = data.iloc[:,feat_to_use]

    # DigitalCellSorter is fed the transposed (genes x cells) table
    data = data.transpose()

    # NOTE(review): the cluster count is hard-coded; the original author also
    # questioned whether it should follow the number of cell types in the data.
    n_clusters = 8
    AvailableCPUsCount = 1
    N_samples_for_distribution = 10000

    start = tm.time()
    pred = DigitalCellSorter.DigitalCellSorter().Process(data, 'DigitalCellSorter_Zhang',
                                                saveDir = OutputDir,
                                                geneListFileName = GeneListPath,
                                                N_samples_for_distribution = N_samples_for_distribution,
                                                AvailableCPUsCount = AvailableCPUsCount,
                                                clusterIndex=None,
                                                clusterName=None,
                                                n_clusters=n_clusters)
    runtime = tm.time() - start

    os.chdir(OutputDir)

    # column 11 of the voting sheet is used as the cell type name of each cluster
    results = pd.read_excel('DigitalCellSorter_Zhang_voting.xlsx',header=0,index_col=None, usecols=[11])

    # Use an object array: a fixed-width '>U10' array silently cut every
    # cell type name down to its first 10 characters.
    prediction = np.full(np.shape(pred), '', dtype=object)

    for i in range(len(results)):
        prediction[np.where(pred == i)] = str(results.iloc[i, 0])

    prediction = pd.DataFrame(prediction)

    if (NumGenes == 0):
        truelab.to_csv("DigitalCellSorter_True_Labels.csv", index = False)
        prediction.to_csv("DigitalCellSorter_Pred_Labels.csv", index = False)
        with open("DigitalCellSorter_Total_Time.csv", 'w') as f:
            f.write("%f\n" % runtime)
    else:
        truelab.to_csv("DigitalCellSorter_" + str(NumGenes) + "_True_Labels.csv", index = False)
        prediction.to_csv("DigitalCellSorter_" + str(NumGenes) + "_Pred_Labels.csv", index = False)
        with open("DigitalCellSorter_" + str(NumGenes) + "_Total_Time.csv", 'w') as f:
            f.write("%f\n" % runtime)

            
================================================
FILE: Scripts/run_Garnett_CV.R
================================================
run_Garnett_CV <- function(DataPath, LabelsPath, CV_RDataPath, GenesPath, MarkerPath, OutputDir, Human){
  "
  run Garnett
  Wrapper script to run Garnett on a benchmark dataset with cross validation
  (folds are taken from the RData file), outputs lists of true and predicted
  cell labels as csv files, as well as computation time in seconds.
  
  Parameters
  ----------
  DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 
  as row names and gene names as column names.
  LabelsPath : Cell population annotations file path (.csv).
  CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
  It must provide col_Index, Cells_to_Keep, n_folds, Train_Idx and Test_Idx.
  GenesPath : Path to the file with the genenames
  MarkerPath : Path to the file with marker genes
  OutputDir : Output directory defining the path of the exported file.
  Human : boolean indicating whether the dataset is human (TRUE) or mouse (FALSE)
  
  Side effects
  ------------
  Changes the working directory to OutputDir and writes four csv files there.
  "

  # load needed libraries; the annotation database is chosen once so the
  # training and classification calls do not need separate human/mouse copies
  library(garnett)
  if (Human) {
    library(org.Hs.eg.db)
    annotation_db <- org.Hs.eg.db
  } else {
    library(org.Mm.eg.db)
    annotation_db <- org.Mm.eg.db
  }
  
  # load the CVFile
  load(CV_RDataPath)
  
  # read the labels
  labels <- as.matrix(read.csv(LabelsPath))
  labels <- as.vector(labels[,col_Index])
  labels <- labels[Cells_to_Keep]
  
  # read the data (first row and first column hold gene names and barcodes)
  # NOTE(review): read without a header, so the remaining cells may be parsed
  # as text rather than numbers - confirm against the input files.
  mat <- read.table(DataPath, sep = ",")
  data <- mat[-1,-1]
  data <- data[Cells_to_Keep,]
  data <- t(data) #ensure that the genes are rows, and the cells are columns
  
  cells <- mat[-1,1]
  cells <- cells[Cells_to_Keep]
  
  # read the genefile 
  fdata <- read.table(GenesPath)
  names(fdata) <- 'gene_short_name'
  row.names(fdata) <- fdata$gene_short_name
  fd <- new("AnnotatedDataFrame", data = fdata)
  
  true_labels <- vector("list", n_folds)
  pred_labels <- vector("list", n_folds)
  train_time <- numeric(n_folds)
  test_time <- numeric(n_folds)
  
  for (i in seq_len(n_folds)) {
    lab_test <- labels[Test_Idx[[i]]]
    
    train <- data[,Train_Idx[[i]]]
    test <- data[,Test_Idx[[i]]]
    
    pdata_train <- data.frame(cells_train = cells[Train_Idx[[i]]])
    pdata_test <- data.frame(cells_test = cells[Test_Idx[[i]]])
    
    row.names(train) <- row.names(fdata)
    row.names(test) <- row.names(fdata)
    colnames(train) <- row.names(pdata_train)
    colnames(test) <- row.names(pdata_test)
    
    pd_train <- new("AnnotatedDataFrame", data = pdata_train)
    pd_test <- new("AnnotatedDataFrame", data = pdata_test)
    
    pbmc_cds_train <- newCellDataSet(as(train, "dgCMatrix"), phenoData = pd_train, featureData = fd)
    pbmc_cds_test <- newCellDataSet(as(test, "dgCMatrix"), phenoData = pd_test, featureData = fd)
    
    pbmc_cds_train <- estimateSizeFactors(pbmc_cds_train)
    pbmc_cds_test <- estimateSizeFactors(pbmc_cds_test)
    
    # training
    start_train <- Sys.time()
    pbmc_classifier <- train_cell_classifier(cds = pbmc_cds_train, 
                                             marker_file = MarkerPath,
                                             db = annotation_db,
                                             cds_gene_id_type = "SYMBOL",
                                             num_unknown = 50,
                                             marker_file_gene_id_type = "SYMBOL")
    end_train <- Sys.time()
    # fix the unit: a bare difference switches between secs/mins/hours by itself
    train_time[i] <- as.numeric(difftime(end_train, start_train, units = "secs"))
    
    # testing
    start_test <- Sys.time()
    pbmc_cds_test <- classify_cells(pbmc_cds_test, 
                                    pbmc_classifier, 
                                    db = annotation_db, 
                                    cluster_extend = TRUE,
                                    cds_gene_id_type = "SYMBOL")
    end_test <- Sys.time()
    test_time[i] <- as.numeric(difftime(end_test, start_test, units = "secs"))
    
    true_labels[i] <- list(lab_test)
    pred_labels[i] <- list(pData(pbmc_cds_test)$cluster_ext_type)
  }
  
  true_labels <- as.vector(unlist(true_labels))
  pred_labels <- as.vector(unlist(pred_labels))
  
  setwd(OutputDir)
  
  # training times go to the training file and testing times to the testing
  # file (the two were previously written to each other's file)
  write.csv(train_time,'Garnett_CV_Training_Time.csv',row.names = FALSE)
  write.csv(test_time,'Garnett_CV_Testing_Time.csv',row.names = FALSE)
  write.csv(true_labels, 'Garnett_CV_True_Labels.csv', row.names = FALSE)
  write.csv(pred_labels, 'Garnett_CV_Pred_Labels.csv', row.names = FALSE)
  
  
}

================================================
FILE: Scripts/run_Garnett_Pretrained.R
================================================
run_Garnett_Pretrained <- function(DataPath, LabelsPath, GenesPath, CV_RDataPath, ClassifierPath, OutputDir, Human){
  "
  run Garnett
  Wrapper script to run Garnett on a benchmark dataset with a pretrained classifier,
  outputs lists of true and predicted cell labels as csv files, as well as
  computation time in seconds.
  
  Parameters
  ----------
  DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 
  as row names and gene names as column names.
  LabelsPath : Cell population annotations file path (.csv).
  GenesPath : Path to the file with the genenames
  CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
  Only col_Index and Cells_to_Keep are used from it.
  ClassifierPath : Path to the pretrained classifier (.RData). It is expected to
  define an object named hsPBMC (human) or mmLung (mouse).
  OutputDir : Output directory defining the path of the exported file.
  Human : boolean indicating whether the dataset is human (TRUE) or mouse (FALSE)
  
  Side effects
  ------------
  Changes the working directory to OutputDir and writes three files there.
  "
  # load needed libraries
  library(garnett)
  
  if (Human) {
    library(org.Hs.eg.db)
  } else {
    library(org.Mm.eg.db)
  }
  
  # load the cross validation settings and the pretrained classifier
  load(CV_RDataPath)
  
  load(ClassifierPath)
  
  # read the labels; pick the annotation column the same way run_Garnett_CV does
  labels <- as.matrix(read.csv(LabelsPath))
  labels <- as.vector(labels[,col_Index])
  labels <- labels[Cells_to_Keep]
  
  mat <- read.table(DataPath, sep = ",")
  data <- mat[-1,-1]
  data <- data[Cells_to_Keep,]
  data <- t(data) #ensure that the genes are rows, and the cells are columns
  
  # keep only the barcodes of retained cells, otherwise the phenotype table has
  # more rows than the expression matrix has columns
  barcodes <- mat[-1,1]
  barcodes <- barcodes[Cells_to_Keep]
  
  pdata <- data.frame(barcodes)
  fdata <- read.table(GenesPath)
  names(fdata) <- 'gene_short_name'
  row.names(fdata) <- fdata$gene_short_name
  
  row.names(data) <- row.names(fdata)
  colnames(data) <- row.names(pdata)
  
  pd <- new("AnnotatedDataFrame", data = pdata)
  fd <- new("AnnotatedDataFrame", data = fdata)
  pbmc_cds <- newCellDataSet(as(data, "dgCMatrix"),
                             phenoData = pd,
                             featureData = fd)
  
  start_time <- Sys.time()
  
  pbmc_cds <- estimateSizeFactors(pbmc_cds)
  
  if (Human){
    pbmc_cds <- classify_cells(pbmc_cds, hsPBMC, db = org.Hs.eg.db, cluster_extend = TRUE, cds_gene_id_type = "SYMBOL")
  } else {
    pbmc_cds <- classify_cells(pbmc_cds, mmLung, db = org.Mm.eg.db, cluster_extend = TRUE, cds_gene_id_type = "SYMBOL")
  }
  
  end_time <- Sys.time()
  
  # fix the unit: a bare difference switches between secs/mins/hours by itself
  test_time <- as.numeric(difftime(end_time, start_time, units = "secs"))
  
  setwd(OutputDir)
  
  # NOTE(review): this file is tab separated and lacks the Pretrained infix used
  # by the other two outputs; name and format are kept for existing consumers.
  write.table(pData(pbmc_cds)$cluster_ext_type, file = "Garnett_Pred_Labels.csv", append = FALSE, quote = TRUE, sep = "\t",
              eol = "\n", na = "NA", dec = ".", row.names = FALSE,
              qmethod = c("escape", "double"),
              fileEncoding = "")
  
  write.csv(labels,"Garnett_Pretrained_True_Labels.csv", row.names = FALSE)
  
  write.csv(test_time,'Garnett_Pretrained_Testing_Time.csv',row.names = FALSE)
  
  
}

================================================
FILE: Scripts/run_LAmbDA.py
================================================
# -*- coding: utf-8 -*-
"""
Created on Thu May 23 13:51:15 2019

@author: Lieke
"""

import os 
import numpy as np
import pandas as pd
import time as tm
import rpy2.robjects as robjects
import tensorflow as tf
import math
import scipy.io as sio
import optunity as opt
from tensorflow.contrib.tensor_forest.python import tensor_forest
from tensorflow.python.ops import resources


def run_LAmbDA(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
    '''
    run LAmbDA classifier
    Wrapper script to run LAmbDA on a benchmark dataset with cross validation
    (folds are taken from the RData file), outputs lists of true and predicted
    cell labels as csv files, as well as computation time.

    For every fold the hyperparameters are first tuned with optunity on an
    inner 75/25 split of the training cells, then the model is refit on the
    whole training fold and evaluated on the test fold. Labels are written as
    integer indices into the sorted unique label values.

    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
    as row names and gene names as column names.
    LabelsPath : Cell population annotations file path (.csv).
    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
    OutputDir : Output directory defining the path of the exported file.
    GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
    defining the genes order for each cross validation fold, default is "".
    NumGenes : Number of genes used in case of feature selection (integer), default is 0.

    Raises
    ------
    RuntimeError : if no inner split with at least 5 training cells per class
    can be drawn from a training fold.

    Side effects
    ------------
    Changes the working directory to OutputDir, writes csv and per-fold .mat
    files there, and sets the module globals read by run_LAmbDA2.
    '''
    # run_LAmbDA2 is called by optunity with hyperparameters only, so the data
    # it works on is handed over through module-level globals.
    global X, Y, Gnp, Dnp, train, test, prt, cv

    # read the Rdata file
    robjects.r['load'](CV_RDataPath)

    nfolds = np.array(robjects.r['n_folds'], dtype = 'int')
    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
    col = np.array(robjects.r['col_Index'], dtype = 'int')
    col = col - 1   # R indices are 1-based
    test_ind = np.array(robjects.r['Test_Idx'])
    train_ind = np.array(robjects.r['Train_Idx'])

    # read the data
    data = pd.read_csv(DataPath,index_col=0,sep=',')
    labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col)

    labels = labels.iloc[tokeep]
    data = data.iloc[tokeep]

    # read the feature file
    if (NumGenes > 0):
        features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')

    # folder with results
    os.chdir(OutputDir)

    tr_time=[]
    ts_time=[]
    truelab = np.zeros([len(labels),1],dtype = int)
    predlab = np.zeros([len(labels),1],dtype = int)

    # upper bound on redraws of the inner split, so an unsatisfiable class-size
    # requirement fails loudly instead of looping forever
    max_split_attempts = 1000

    for i in range(np.squeeze(nfolds)):
        test_ind_i = np.array(test_ind[i], dtype = 'int') - 1
        train_ind_i = np.array(train_ind[i], dtype = 'int') - 1

        X = np.array(data)
        if (NumGenes > 0):
            X = np.log2(X/10+1)
            feat_to_use = features.iloc[0:NumGenes,i]
            X = X[:,feat_to_use]
        else:
            X = np.log2(np.transpose(select_feats(np.transpose(X),0.5,80))/10+1)

        # one-hot encode the labels
        uniq = np.unique(labels)
        Y = np.zeros([len(labels),len(uniq)],int)

        for j in range(len(uniq)):
            Y[np.where(labels == uniq[j])[0],j] = 1

        Y = np.array(Y)

        # identity label mapping and a single dataset
        Gnp = np.zeros([len(uniq),len(uniq)],int)
        np.fill_diagonal(Gnp,1)
        Gnp = np.array(Gnp)

        Dnp = np.ones([len(uniq),1],int)
        Dnp = np.array(Dnp)

        # Inner 75/25 split used only for hyperparameter tuning. It is drawn
        # from the rows of the training fold, so the held-out test cells never
        # take part in tuning and the validation part is never empty.
        train_samp = int(np.floor(0.75*len(train_ind_i)))
        attempts = 0
        while True:
            perm = np.random.permutation(train_ind_i)
            train = perm[0:train_samp]
            test = perm[train_samp:]
            # every class needs at least 5 cells in the tuning-train part
            if np.sum(np.sum(Y[train,:],0)<5) == 0:
                break
            attempts += 1
            if attempts >= max_split_attempts:
                raise RuntimeError("LAmbDA: could not draw an inner training split with at least 5 cells per class in fold %d" % i)

        cv = i
        prt = False   # do not write prediction files while tuning
        opt_params = None

        start=tm.time()
        opt_params, _, _ = opt.minimize(run_LAmbDA2,solver_name='sobol', gamma=[0.8,1.2], delta=[0.05,0.95], tau=[10.0,11.0], prc_cut=[20,50], bs_prc=[0.2,0.6], num_trees=[10,200], max_nodes=[100,1000], num_evals=50)
        tr_time.append(tm.time()-start)

        print("Finished training!")

        # final run on the real train/test fold; run_LAmbDA2 writes the
        # predictions and the truth of this fold to .mat files
        prt = True
        train = train_ind_i
        test = test_ind_i

        start=tm.time()
        run_LAmbDA2(opt_params['gamma'], opt_params['delta'], opt_params['tau'], opt_params['prc_cut'], opt_params['bs_prc'], opt_params['num_trees'], opt_params['max_nodes'])
        ts_time.append(tm.time()-start)

        tf.reset_default_graph()

        predfile = 'preds_cv' + str(cv) + '.mat'
        truefile = 'truth_cv' + str(cv) + '.mat'
        pred = sio.loadmat(predfile)
        truth = sio.loadmat(truefile)

        pred = pred['preds']
        truth = truth['labels']

        pred_ind = np.argmax(pred,axis=1)
        truth_ind = np.argmax(truth,axis=1)

        predlab[test_ind_i,0] = pred_ind
        truelab[test_ind_i,0] = truth_ind


    truelab = pd.DataFrame(truelab)
    predlab = pd.DataFrame(predlab)

    tr_time = pd.DataFrame(tr_time)
    ts_time = pd.DataFrame(ts_time)

    if (NumGenes == 0):
        truelab.to_csv("LAmbDA_True_Labels.csv", index = False)
        predlab.to_csv("LAmbDA_Pred_Labels.csv", index = False)
        tr_time.to_csv("LAmbDA_Training_Time.csv", index = False)
        ts_time.to_csv("LAmbDA_Testing_Time.csv", index = False)
    else:
        truelab.to_csv("LAmbDA_" + str(NumGenes) + "_True_Labels.csv", index = False)
        predlab.to_csv("LAmbDA_" + str(NumGenes) + "_Pred_Labels.csv", index = False)
        tr_time.to_csv("LAmbDA_" + str(NumGenes) + "_Training_Time.csv", index = False)
        ts_time.to_csv("LAmbDA_" + str(NumGenes) + "_Testing_Time.csv", index = False)


##### Functions copied from LAmbDA's Github
def wt_cutoff(colnum,cutoff,Gtmp,gamma):
    '''
    Weighted sample-count target for one row of Gtmp.

    The base cutoff is scaled by log2(max row sum / row sum of colnum + 1)
    raised to the power gamma, and the result is rounded up.
    '''
    row_totals = np.sum(Gtmp, axis=1)
    ratio = max(row_totals) / row_totals[colnum]
    weight = math.log(ratio + 1, 2) ** gamma
    return math.ceil(cutoff * weight)

def resample(prc_cut,Y,Gtmp,train,gamma):
    '''
    Rebalance the training indices per class.

    The base cutoff is the prc_cut-th percentile of the per-class counts in
    Y[train, :], rounded up; wt_cutoff turns it into a per-class target.
    Classes below their target get extra indices drawn with replacement,
    classes above it lose indices drawn without replacement, and classes with
    no cells or exactly on target are left alone.

    Returns the kept indices followed by the added ones.
    '''
    extra = []
    dropped = []
    class_counts = np.sum(Y[train, :], axis=0)
    cutoff = math.ceil(np.percentile(class_counts, prc_cut))
    for k, count in enumerate(class_counts):
        if count == 0:
            continue
        target = wt_cutoff(k, cutoff, Gtmp, gamma)
        if count == target:
            continue
        members = np.squeeze(np.array(np.where(Y[train, k] >= 1)))
        if count < target:
            # under-represented: oversample
            picked = np.random.choice(train[members], int(target - count))
            extra = extra + picked.tolist()
        else:
            # over-represented: mark a random subset for removal
            picked = np.random.choice(train[members], int(count - target), replace=False)
            dropped = dropped + picked.tolist()
    kept = [val for val in train if val not in dropped]
    return np.concatenate((kept, extra))

def select_feats(Xtmp,num_zero_prc_cut,var_prc_cut):
    '''
    Filter the rows (features) of Xtmp in two steps.

    First keep rows whose number of zero entries is below
    num_zero_prc_cut * number of columns, then keep the remaining rows whose
    variance is strictly above the var_prc_cut-th percentile of the variances.
    '''
    # step 1: drop features that are zero too often
    zero_counts = np.sum(Xtmp == 0, axis=1)
    max_zeros = num_zero_prc_cut * Xtmp.shape[1]
    dense = Xtmp[zero_counts < max_zeros, :]
    # step 2: drop features with low variance
    variances = np.var(dense, axis=1)
    threshold = np.percentile(variances, var_prc_cut)
    return dense[variances > threshold, :]

def get_yn(predict,ys,delta,tau,output_feats):
    '''
    Build one-hot training targets from the current predictions.

    Uses the module globals Dnp and Gnp together with the one-hot labels ys to
    reweight predict (offset by 0.1), blends two intermediate targets with
    weight delta, and returns the one-hot encoding (depth output_feats) of the
    row-wise argmax as a float32 tensor.
    '''
    D = tf.cast(Dnp, tf.float32)
    G = tf.cast(Gnp, tf.float32)
    ys = tf.cast(ys, tf.float32)
    shifted = predict + 0.1
    ys_by_dataset = tf.matmul(ys, D)
    dataset_totals = tf.reshape(tf.reduce_sum(tf.transpose(ys_by_dataset), 1), (-1, 1))
    Cm = tf.matmul(tf.transpose(ys_by_dataset), shifted) / dataset_totals
    allowed = tf.cast(tf.matmul(tf.transpose(D), G) > 0, tf.float32)
    mCm = tf.reshape(tf.reduce_mean(allowed * Cm, 1), (-1, 1))
    yw = tf.multiply(shifted, tf.matmul(ys_by_dataset, tf.pow(mCm / Cm, tau)))
    ye = tf.multiply(tf.matmul(ys, G), yw)
    yt = tf.matmul(ys, tf.matmul(tf.transpose(ys), ye))
    ya = (delta * yt) + ((1 - delta) * ye)
    winners = tf.argmax(ya, axis=1)
    return tf.cast(tf.one_hot(winners, output_feats), dtype=tf.float32)

def get_yi(rowsums,G2,ys):
    '''
    Map the labels ys through G2 and return the product as a float32 tensor.

    rowsums is accepted but not used.
    '''
    labels_f32 = tf.cast(ys, tf.float32)
    mapping_f32 = tf.cast(G2, tf.float32)
    return tf.cast(tf.matmul(labels_f32, mapping_f32), dtype=tf.float32)

def run_LAmbDA2(gamma, delta, tau, prc_cut, bs_prc, num_trees, max_nodes):
    '''
    Train and evaluate one LAmbDA random-forest model.

    The data is taken from module globals set by run_LAmbDA: X (features),
    Y (one-hot labels), Gnp, Dnp, train and test (row indices), prt and cv.
    When prt is true, the predictions for the test rows and their one-hot
    truth are saved to 'preds_cv<cv>.mat' and 'truth_cv<cv>.mat'.

    Parameters
    ----------
    gamma, prc_cut : passed to resample() to rebalance the training indices.
    delta, tau : passed to get_yn() when building the training targets.
    bs_prc : fraction of the resampled training set used as the batch.
    num_trees, max_nodes : forest size, truncated to integers.

    Returns
    -------
    Mean squared difference between the one-hot targets and the predictions
    on the test rows (the value optunity minimizes).
    '''
    global X, Y, Gnp, Dnp, train, test, prt, cv
    G = tf.cast(Gnp, tf.float32)
    num_trees = int(num_trees)
    max_nodes = int(max_nodes)
    prc_cut = int(np.ceil(prc_cut))
    print("gamma=%.4f, delta=%.4f, tau=%.4f, prc_cut=%i, bs_prc=%.4f, num_trees=%i, max_nodes=%i" % (gamma, delta, tau, prc_cut, bs_prc, num_trees, max_nodes))
    input_feats = X.shape[1]
    output_feats = G.shape.as_list()[1]
    rowsums = np.sum(Gnp,axis=1)
    # resample() concatenates plain lists; when nothing is oversampled the
    # result is a float array, which cannot be used as an index, so cast it.
    train2 = np.asarray(resample(prc_cut, Y, Gnp, train, gamma), dtype=int)
    bs = int(np.ceil(bs_prc*train2.size))
    xs = tf.placeholder(tf.float32, [None,input_feats])
    yin = tf.placeholder(tf.int32, [None])
    print("Vars loaded xs and ys created")
    hparams = tensor_forest.ForestHParams(num_classes=output_feats,
                                    num_features=input_feats,
                                    num_trees=num_trees,
                                    max_nodes=max_nodes).fill()
    print("Tensor forest hparams created")
    forest_graph = tensor_forest.RandomForestGraphs(hparams)
    print("Tensor forest graph created")
    train_op = forest_graph.training_graph(xs, yin)
    # NOTE(review): loss_op is never run; kept so graph construction is unchanged.
    loss_op = forest_graph.training_loss(xs, yin)
    print("Loss and train ops created")
    predict, _, _ = forest_graph.inference_graph(xs)
    print("Tensor forest variables created through predict")
    accuracy_op = tf.reduce_mean(tf.reduce_sum(tf.square(tf.one_hot(yin,output_feats)-predict),reduction_indices=[1]))
    print(tf.reduce_sum(tf.square(tf.one_hot(yin,output_feats)-predict),reduction_indices=[1]))
    print("Lambda specific variables created")
    # Rows of Gnp that sum to more than 1 are zeroed; only cells whose label
    # still maps to exactly one output are used in the first training phase.
    G2 = np.copy(Gnp)
    G2[rowsums>1,:] = 0
    YI = np.matmul(Y,G2)
    YIrs = np.sum(YI,axis=1)
    trainI = train2[np.in1d(train2,np.where(YIrs==1))]
    print("data type trainI,",trainI.dtype)
    testI = test[np.in1d(test,np.where(YIrs==1))]
    print("trainI testI created")
    init_vars = tf.group(tf.global_variables_initializer(),
    resources.initialize_resources(resources.shared_resources()))
    sess = tf.Session()
    # Close the session even on failure: this function is called once per
    # optunity evaluation and each open session holds on to its resources.
    try:
        sess.run(init_vars)
        print("Session started")
        tensor_trainI = {xs: X[trainI, :], yin: sess.run(tf.argmax(get_yi(rowsums,G2,Y[trainI, :]),axis=1))}
        print("tensor_trainI made")
        tensor_testI = {xs: X[testI, :], yin: sess.run(tf.argmax(get_yi(rowsums,G2,Y[testI, :]),axis=1))}
        print("tensor_testI made")
        tensor_train = {xs: X[train2[0:bs], :], yin: sess.run(tf.argmax(get_yn(sess.run(predict,feed_dict={xs:X[train2[0:bs],:]}),Y[train2[0:bs], :],delta,tau,output_feats),axis=1))}
        print("tensor_train made")
        tensor_test = {xs: X[test, :], yin: sess.run(tf.argmax(get_yn(sess.run(predict,feed_dict={xs:X[test,:]}),Y[test, :],delta,tau,output_feats),axis=1))}
        print("tensor_test made")
        print("Beginning iterations")
        # Starting training iterations
        print(X.shape)
        for i in range(1,101):
            if i < 50:
                # phase 1: train on the cells selected in trainI
                sess.run(train_op, feed_dict=tensor_trainI)
                if i % 10 == 0:
                    print(str(sess.run(accuracy_op, feed_dict=tensor_trainI)) + ' ' + str(sess.run(accuracy_op, feed_dict=tensor_testI)))
            else:
                # phase 2: train on the batch with targets from get_yn
                sess.run(train_op, feed_dict=tensor_train)
                if i % 10 == 0:
                    print(str(sess.run(accuracy_op, feed_dict=tensor_train)) + ' ' + str(sess.run(accuracy_op, feed_dict=tensor_test)))
                # NOTE(review): an 'elif i % 10 == 0' branch that reshuffled
                # train2 and rebuilt the batch used to follow here. It could
                # never run (same condition as the 'if') and called the
                # non-existent np.random_shuffle, so it was removed; the batch
                # therefore stays fixed, exactly as before.
        if prt:
            blah = sess.run(predict, feed_dict=tensor_test)
            sio.savemat('preds_cv' + str(cv) + '.mat', {'preds': blah})
            sio.savemat('truth_cv' + str(cv) + '.mat', {'labels': Y[test, :]})
        acc = sess.run(accuracy_op, feed_dict=tensor_test)
    finally:
        sess.close()
    print("loss1=%.4f, gamma=%.4f, delta=%.4f, tau=%.4f, prc_cut=%i, bs_prc=%.4f, num_trees=%i, max_nodes=%i" % (acc, gamma, delta, tau, prc_cut, bs_prc, num_trees, max_nodes))
    tf.reset_default_graph()
    return(acc)


================================================
FILE: Scripts/run_LDA.py
================================================
import os 
import numpy as np
import pandas as pd
import time as tm
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import rpy2.robjects as robjects


def run_LDA(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
    '''
    Baseline classifier: Linear Discriminant Analysis (LDA).
    Runs LDA on log1p-transformed data over the cross validation folds stored
    in the RData file and writes the true labels, predicted labels, training
    times and testing times as csv files.

    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
    as row names and gene names as column names.
    LabelsPath : Cell population annotations file path (.csv).
    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
    OutputDir : Output directory defining the path of the exported file
    (the working directory is changed to it).
    GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
    one column of gene indices per cross validation fold, default is "".
    NumGenes : Number of genes used in case of feature selection (integer), default is 0.
    '''

    # pull the cross validation setup out of the RData file
    robjects.r['load'](CV_RDataPath)

    fold_count = np.array(robjects.r['n_folds'], dtype = 'int')
    keep_mask = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
    label_col = np.array(robjects.r['col_Index'], dtype = 'int') - 1   # R is 1-based
    test_folds = np.array(robjects.r['Test_Idx'])
    train_folds = np.array(robjects.r['Train_Idx'])

    # expression matrix and annotations, restricted to the retained cells
    expression = pd.read_csv(DataPath,index_col=0,sep=',')
    annotation = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = label_col)

    annotation = annotation.iloc[keep_mask]
    expression = expression.iloc[keep_mask]

    # per-fold gene ranking, only needed with feature selection
    if (NumGenes > 0):
        gene_order = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')

    # all outputs are written relative to the output folder
    os.chdir(OutputDir)

    expression = np.log1p(expression)

    lda = LinearDiscriminantAnalysis()

    fit_seconds = []
    predict_seconds = []
    all_true = []
    all_pred = []

    for fold in range(np.squeeze(fold_count)):
        # fold indices come from R, shift them to 0-based
        held_out = np.array(test_folds[fold], dtype = 'int') - 1
        fitted_on = np.array(train_folds[fold], dtype = 'int') - 1

        x_train = expression.iloc[fitted_on]
        x_test = expression.iloc[held_out]
        y_train = annotation.iloc[fitted_on]
        y_test = annotation.iloc[held_out]

        if (NumGenes > 0):
            chosen = gene_order.iloc[0:NumGenes,fold]
            x_train = x_train.iloc[:,chosen]
            x_test = x_test.iloc[:,chosen]

        t0 = tm.time()
        lda.fit(x_train, y_train)
        fit_seconds.append(tm.time()-t0)

        t0 = tm.time()
        fold_pred = lda.predict(x_test)
        predict_seconds.append(tm.time()-t0)

        all_true.extend(y_test.values)
        all_pred.extend(fold_pred)

    true_frame = pd.DataFrame(all_true)
    pred_frame = pd.DataFrame(all_pred)
    fit_frame = pd.DataFrame(fit_seconds)
    predict_frame = pd.DataFrame(predict_seconds)

    if (NumGenes == 0):
        true_frame.to_csv("LDA_True_Labels.csv", index = False)
        pred_frame.to_csv("LDA_Pred_Labels.csv", index = False)
        fit_frame.to_csv("LDA_Training_Time.csv", index = False)
        predict_frame.to_csv("LDA_Testing_Time.csv", index = False)
    else:
        true_frame.to_csv("LDA_" + str(NumGenes) + "_True_Labels.csv", index = False)
        pred_frame.to_csv("LDA_" + str(NumGenes) + "_Pred_Labels.csv", index = False)
        fit_frame.to_csv("LDA_" + str(NumGenes) + "_Training_Time.csv", index = False)
        predict_frame.to_csv("LDA_" + str(NumGenes) + "_Testing_Time.csv", index = False)

    
================================================
FILE: Scripts/run_LDA_rejection.py
================================================
import os 
import numpy as np
import pandas as pd
import time as tm
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import rpy2.robjects as robjects


def run_LDA_rejection(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0, Threshold = 0.7):
    '''
    run baseline classifier: LDA with rejection option
    Wrapper script to run LDA classifier on a benchmark dataset with cross validation
    (folds are taken from the RData file). Cells whose highest class probability is
    below Threshold are labeled 'Unknown'. Outputs lists of true and predicted cell
    labels as csv files, as well as computation time.

    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
    as row names and gene names as column names.
    LabelsPath : Cell population annotations file path (.csv).
    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
    OutputDir : Output directory defining the path of the exported file
    (the working directory is changed to it).
    GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
    defining the genes order for each cross validation fold, default is "".
    NumGenes : Number of genes used in case of feature selection (integer), default is 0.
    Threshold : Minimum class probability below which a cell is rejected, default is 0.7.
    '''

    # read the Rdata file
    robjects.r['load'](CV_RDataPath)

    nfolds = np.array(robjects.r['n_folds'], dtype = 'int')
    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
    col = np.array(robjects.r['col_Index'], dtype = 'int')
    col = col - 1   # R indices are 1-based
    test_ind = np.array(robjects.r['Test_Idx'])
    train_ind = np.array(robjects.r['Train_Idx'])

    # read the data
    data = pd.read_csv(DataPath,index_col=0,sep=',')
    labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col)

    labels = labels.iloc[tokeep]
    data = data.iloc[tokeep]

    # read the feature file
    if (NumGenes > 0):
        features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')

    # folder with results
    os.chdir(OutputDir)

    # normalize data
    data = np.log1p(data)

    Classifier = LinearDiscriminantAnalysis()

    tr_time=[]
    ts_time=[]
    truelab = []
    pred = []

    for i in range(np.squeeze(nfolds)):
        test_ind_i = np.array(test_ind[i], dtype = 'int') - 1
        train_ind_i = np.array(train_ind[i], dtype = 'int') - 1

        train=data.iloc[train_ind_i]
        test=data.iloc[test_ind_i]
        y_train=labels.iloc[train_ind_i]
        y_test=labels.iloc[test_ind_i]

        if (NumGenes > 0):
            feat_to_use = features.iloc[0:NumGenes,i]
            train = train.iloc[:,feat_to_use]
            test = test.iloc[:,feat_to_use]

        start=tm.time()
        Classifier.fit(train, y_train)
        tr_time.append(tm.time()-start)

        start=tm.time()
        # Convert to an object array before inserting 'Unknown': a fixed-width
        # string array would cut it down to the longest class name's length.
        predicted = np.asarray(Classifier.predict(test)).astype(object)
        prob = np.max(Classifier.predict_proba(test), axis = 1)
        unlabeled = np.where(prob < Threshold)
        predicted[unlabeled] = 'Unknown'
        ts_time.append(tm.time()-start)

        truelab.extend(y_test.values)
        pred.extend(predicted)

    truelab = pd.DataFrame(truelab)
    pred = pd.DataFrame(pred)

    tr_time = pd.DataFrame(tr_time)
    ts_time = pd.DataFrame(ts_time)

    # NOTE(review): these are the same file names run_LDA writes, so running
    # both into one OutputDir overwrites results; names kept for existing consumers.
    if (NumGenes == 0):
        truelab.to_csv("LDA_True_Labels.csv", index = False)
        pred.to_csv("LDA_Pred_Labels.csv", index = False)
        tr_time.to_csv("LDA_Training_Time.csv", index = False)
        ts_time.to_csv("LDA_Testing_Time.csv", index = False)
    else:
        truelab.to_csv("LDA_" + str(NumGenes) + "_True_Labels.csv", index = False)
        pred.to_csv("LDA_" + str(NumGenes) + "_Pred_Labels.csv", index = False)
        tr_time.to_csv("LDA_" + str(NumGenes) + "_Training_Time.csv", index = False)
        ts_time.to_csv("LDA_" + str(NumGenes) + "_Testing_Time.csv", index = False)

    
================================================
FILE: Scripts/run_NMC.py
================================================
import os 
import numpy as np
import pandas as pd
import time as tm
from sklearn.neighbors import NearestCentroid
import rpy2.robjects as robjects


def run_NMC(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
    '''
    run baseline classifier: NMC
    Wrapper script to run a NMC classifier (sklearn NearestCentroid) on a benchmark dataset,
    using the cross validation folds stored in the RData file,
    outputs lists of true and predicted cell labels as csv files, as well as computation time.
  
    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 
    as row names and gene names as column names.
    LabelsPath : Cell population annotations file path (.csv).
    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
    OutputDir : Output directory defining the path of the exported file.
    The working directory of the process is changed to this directory.
    GeneOrderPath : Gene order file path (.csv) obtained from feature selection, 
    defining the genes order for each cross validation fold, default is "" 
    (the file is only read when NumGenes > 0).
    NumGenes : Number of genes used in case of feature selection (integer), default is 0.

    Outputs
    -------
    NMC_True_Labels.csv, NMC_Pred_Labels.csv, NMC_Training_Time.csv and NMC_Testing_Time.csv
    written in OutputDir; the prefix becomes "NMC_<NumGenes>_" when NumGenes > 0.
    '''
        
    # read the Rdata file
    robjects.r['load'](CV_RDataPath)

    nfolds = int(np.squeeze(np.array(robjects.r['n_folds'], dtype = 'int')))
    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
    # indices stored by R are 1-based, pandas positions are 0-based
    col = np.array(robjects.r['col_Index'], dtype = 'int') - 1
    # Keep the per-fold index vectors in their R list and convert one fold at a
    # time: folds may differ in length, and NumPy >= 1.24 raises an error when
    # asked to build a single array from ragged sequences.
    test_ind = robjects.r['Test_Idx']
    train_ind = robjects.r['Train_Idx']

    # read the data
    data = pd.read_csv(DataPath,index_col=0,sep=',')
    labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col)
    
    labels = labels.iloc[tokeep]
    data = data.iloc[tokeep]
    
    # read the feature file
    if (NumGenes > 0):
        features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')
    
    # folder with results
    os.chdir(OutputDir)
    
    # normalize data
    data = np.log1p(data)
        
    Classifier = NearestCentroid()
            
    tr_time=[]
    ts_time=[]
    truelab = []
    pred = []
        
    for i in range(nfolds):
        test_ind_i = np.array(test_ind[i], dtype = 'int') - 1
        train_ind_i = np.array(train_ind[i], dtype = 'int') - 1
    
        train=data.iloc[train_ind_i]
        test=data.iloc[test_ind_i]
        y_train=labels.iloc[train_ind_i]
        y_test=labels.iloc[test_ind_i]
            
        if (NumGenes > 0):
            # column i of the gene order file holds the gene ranking of fold i
            feat_to_use = features.iloc[0:NumGenes,i]
            train = train.iloc[:,feat_to_use]
            test = test.iloc[:,feat_to_use]
            
        # sklearn expects a 1-D target; passing the one-column DataFrame
        # raises a DataConversionWarning on every fold
        y_train_1d = y_train.values.ravel()
            
        start=tm.time()
        Classifier.fit(train, y_train_1d)
        tr_time.append(tm.time()-start)
                    
        start=tm.time()
        predicted = Classifier.predict(test)
        ts_time.append(tm.time()-start)
            
        truelab.extend(y_test.values)
        pred.extend(predicted)
                
    truelab = pd.DataFrame(truelab)
    pred = pd.DataFrame(pred)
        
    tr_time = pd.DataFrame(tr_time)
    ts_time = pd.DataFrame(ts_time)
        
    # the number of genes is part of the file names only when feature selection is used
    if (NumGenes == 0):
        prefix = "NMC_"
    else:
        prefix = "NMC_" + str(NumGenes) + "_"
        
    truelab.to_csv(prefix + "True_Labels.csv", index = False)
    pred.to_csv(prefix + "Pred_Labels.csv", index = False)
    tr_time.to_csv(prefix + "Training_Time.csv", index = False)
    ts_time.to_csv(prefix + "Testing_Time.csv", index = False)

    
================================================
FILE: Scripts/run_RF.py
================================================
import os 
import numpy as np
import pandas as pd
import time as tm
from sklearn.ensemble import RandomForestClassifier
import rpy2.robjects as robjects


def run_RF(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
    '''
    run baseline classifier: RF
    Wrapper script to run a RF classifier with 50 trees on a benchmark dataset,
    using the cross validation folds stored in the RData file,
    outputs lists of true and predicted cell labels as csv files, as well as computation time.
  
    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 
    as row names and gene names as column names.
    LabelsPath : Cell population annotations file path (.csv).
    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
    OutputDir : Output directory defining the path of the exported file.
    The working directory of the process is changed to this directory.
    GeneOrderPath : Gene order file path (.csv) obtained from feature selection, 
    defining the genes order for each cross validation fold, default is "" 
    (the file is only read when NumGenes > 0).
    NumGenes : Number of genes used in case of feature selection (integer), default is 0.

    Outputs
    -------
    RF_True_Labels.csv, RF_Pred_Labels.csv, RF_Training_Time.csv and RF_Testing_Time.csv
    written in OutputDir; the prefix becomes "RF_<NumGenes>_" when NumGenes > 0.
    '''
        
    # read the Rdata file
    robjects.r['load'](CV_RDataPath)

    nfolds = int(np.squeeze(np.array(robjects.r['n_folds'], dtype = 'int')))
    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
    # indices stored by R are 1-based, pandas positions are 0-based
    col = np.array(robjects.r['col_Index'], dtype = 'int') - 1
    # Keep the per-fold index vectors in their R list and convert one fold at a
    # time: folds may differ in length, and NumPy >= 1.24 raises an error when
    # asked to build a single array from ragged sequences.
    test_ind = robjects.r['Test_Idx']
    train_ind = robjects.r['Train_Idx']

    # read the data
    data = pd.read_csv(DataPath,index_col=0,sep=',')
    labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col)
    
    labels = labels.iloc[tokeep]
    data = data.iloc[tokeep]
    
    # read the feature file
    if (NumGenes > 0):
        features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')
    
    # folder with results
    os.chdir(OutputDir)
    
    # normalize data
    data = np.log1p(data)
        
    Classifier = RandomForestClassifier(n_estimators = 50)
            
    tr_time=[]
    ts_time=[]
    truelab = []
    pred = []
        
    for i in range(nfolds):
        test_ind_i = np.array(test_ind[i], dtype = 'int') - 1
        train_ind_i = np.array(train_ind[i], dtype = 'int') - 1
    
        train=data.iloc[train_ind_i]
        test=data.iloc[test_ind_i]
        y_train=labels.iloc[train_ind_i]
        y_test=labels.iloc[test_ind_i]
            
        if (NumGenes > 0):
            # column i of the gene order file holds the gene ranking of fold i
            feat_to_use = features.iloc[0:NumGenes,i]
            train = train.iloc[:,feat_to_use]
            test = test.iloc[:,feat_to_use]
            
        # sklearn expects a 1-D target; passing the one-column DataFrame
        # raises a DataConversionWarning on every fold
        y_train_1d = y_train.values.ravel()
            
        start=tm.time()
        Classifier.fit(train, y_train_1d)
        tr_time.append(tm.time()-start)
                    
        start=tm.time()
        predicted = Classifier.predict(test)
        ts_time.append(tm.time()-start)
            
        truelab.extend(y_test.values)
        pred.extend(predicted)
                
    truelab = pd.DataFrame(truelab)
    pred = pd.DataFrame(pred)
        
    tr_time = pd.DataFrame(tr_time)
    ts_time = pd.DataFrame(ts_time)
        
    # the number of genes is part of the file names only when feature selection is used
    if (NumGenes == 0):
        prefix = "RF_"
    else:
        prefix = "RF_" + str(NumGenes) + "_"
        
    truelab.to_csv(prefix + "True_Labels.csv", index = False)
    pred.to_csv(prefix + "Pred_Labels.csv", index = False)
    tr_time.to_csv(prefix + "Training_Time.csv", index = False)
    ts_time.to_csv(prefix + "Testing_Time.csv", index = False)

    
================================================
FILE: Scripts/run_SCINA.R
================================================
run_SCINA<-function(DataPath,LabelsPath,GeneSigPath,OutputDir){
  "
  run SCINA
  Wrapper around SCINA for a benchmark dataset. Only the cells annotated as
  CD14+ Monocyte, CD19+ B or CD56+ NK are kept; their true labels, the labels
  predicted by SCINA and the SCINA running time (seconds) are exported as csv files.
  
  Parameters
  ----------
  DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 
  as row names and gene names as column names.
  LabelsPath : Cell population annotations file path (.csv).
  GeneSigPath : Cell type marker genes file path (.csv)
  OutputDir : Output directory defining the path of the exported file
  (the working directory is switched to it before writing).
  "
  
  # the three populations evaluated, mapped to the names used in the exported labels
  population_names <- c('CD14+ Monocyte' = 'CD14_Monocyte',
                        'CD19+ B' = 'CD19_B',
                        'CD56+ NK' = 'CD56_NK')
  
  expr <- read.csv(DataPath,row.names = 1)
  annotation <- as.vector(as.matrix(read.csv(LabelsPath)))
  
  # restrict cells and labels to the selected populations, then rename the labels
  selected <- annotation %in% names(population_names)
  expr <- expr[selected,]
  annotation <- unname(population_names[annotation[selected]])
  
  #############################################################################
  #                                 SCINA                                     #
  #############################################################################
  library(SCINA)
  marker_sets <- preprocess.signatures(GeneSigPath)
  
  library(preprocessCore)
  # genes x cells matrix, log-transformed then quantile-normalized;
  # assigning through [] keeps the dimnames of the matrix
  expr_mat <- log(t(as.matrix(expr)) + 1)
  expr_mat[] <- normalize.quantiles(expr_mat)
  
  # only the SCINA call itself is timed
  t_begin <- Sys.time()
  scina_out <- SCINA(expr_mat, marker_sets)
  t_end <- Sys.time()
  elapsed_secs <- as.numeric(difftime(t_end,t_begin,units = 'secs'))
  
  setwd(OutputDir)
  
  write.csv(annotation,'SCINA_True_Labels.csv',row.names = FALSE)
  write.csv(scina_out$cell_labels,'SCINA_Pred_Labels.csv',row.names = FALSE)
  write.csv(elapsed_secs,'SCINA_Total_Time.csv',row.names = FALSE)
}


================================================
FILE: Scripts/run_SVM.py
================================================
import os 
import numpy as np
import pandas as pd
import time as tm
from sklearn.svm import LinearSVC
import rpy2.robjects as robjects


def run_SVM(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
    '''
    run baseline classifier: SVM
    Wrapper script to run an SVM classifier with a linear kernel (sklearn LinearSVC) on a benchmark dataset,
    using the cross validation folds stored in the RData file,
    outputs lists of true and predicted cell labels as csv files, as well as computation time.
  
    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 
    as row names and gene names as column names.
    LabelsPath : Cell population annotations file path (.csv).
    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
    OutputDir : Output directory defining the path of the exported file.
    The working directory of the process is changed to this directory.
    GeneOrderPath : Gene order file path (.csv) obtained from feature selection, 
    defining the genes order for each cross validation fold, default is "" 
    (the file is only read when NumGenes > 0).
    NumGenes : Number of genes used in case of feature selection (integer), default is 0.

    Outputs
    -------
    SVM_True_Labels.csv, SVM_Pred_Labels.csv, SVM_Training_Time.csv and SVM_Testing_Time.csv
    written in OutputDir; the prefix becomes "SVM_<NumGenes>_" when NumGenes > 0.
    '''
        
    # read the Rdata file
    robjects.r['load'](CV_RDataPath)

    nfolds = int(np.squeeze(np.array(robjects.r['n_folds'], dtype = 'int')))
    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
    # indices stored by R are 1-based, pandas positions are 0-based
    col = np.array(robjects.r['col_Index'], dtype = 'int') - 1
    # Keep the per-fold index vectors in their R list and convert one fold at a
    # time: folds may differ in length, and NumPy >= 1.24 raises an error when
    # asked to build a single array from ragged sequences.
    test_ind = robjects.r['Test_Idx']
    train_ind = robjects.r['Train_Idx']

    # read the data
    data = pd.read_csv(DataPath,index_col=0,sep=',')
    labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col)
    
    labels = labels.iloc[tokeep]
    data = data.iloc[tokeep]
    
    # read the feature file
    if (NumGenes > 0):
        features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')
    
    # folder with results
    os.chdir(OutputDir)
    
    # normalize data
    data = np.log1p(data)
        
    Classifier = LinearSVC()
            
    tr_time=[]
    ts_time=[]
    truelab = []
    pred = []
        
    for i in range(nfolds):
        test_ind_i = np.array(test_ind[i], dtype = 'int') - 1
        train_ind_i = np.array(train_ind[i], dtype = 'int') - 1
    
        train=data.iloc[train_ind_i]
        test=data.iloc[test_ind_i]
        y_train=labels.iloc[train_ind_i]
        y_test=labels.iloc[test_ind_i]
            
        if (NumGenes > 0):
            # column i of the gene order file holds the gene ranking of fold i
            feat_to_use = features.iloc[0:NumGenes,i]
            train = train.iloc[:,feat_to_use]
            test = test.iloc[:,feat_to_use]
            
        # sklearn expects a 1-D target; passing the one-column DataFrame
        # raises a DataConversionWarning on every fold
        y_train_1d = y_train.values.ravel()
            
        start=tm.time()
        Classifier.fit(train, y_train_1d)
        tr_time.append(tm.time()-start)
                    
        start=tm.time()
        predicted = Classifier.predict(test)
        ts_time.append(tm.time()-start)
            
        truelab.extend(y_test.values)
        pred.extend(predicted)
                
    truelab = pd.DataFrame(truelab)
    pred = pd.DataFrame(pred)
        
    tr_time = pd.DataFrame(tr_time)
    ts_time = pd.DataFrame(ts_time)
        
    # the number of genes is part of the file names only when feature selection is used
    if (NumGenes == 0):
        prefix = "SVM_"
    else:
        prefix = "SVM_" + str(NumGenes) + "_"
        
    truelab.to_csv(prefix + "True_Labels.csv", index = False)
    pred.to_csv(prefix + "Pred_Labels.csv", index = False)
    tr_time.to_csv(prefix + "Training_Time.csv", index = False)
    ts_time.to_csv(prefix + "Testing_Time.csv", index = False)

    
================================================
FILE: Scripts/run_SVM_rejection.py
================================================
import os 
import numpy as np
import pandas as pd
import time as tm
from sklearn.svm import LinearSVC
import rpy2.robjects as robjects
from sklearn.calibration import CalibratedClassifierCV


def run_SVM(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0, Threshold = 0.7):
    '''
    run baseline classifier: SVM with rejection option
    Wrapper script to run an SVM classifier with a linear kernel (sklearn LinearSVC wrapped in 
    CalibratedClassifierCV) on a benchmark dataset, using the cross validation folds stored in the RData file.
    Cells whose highest class probability is below Threshold are labelled 'Unknown'.
    Outputs lists of true and predicted cell labels as csv files, as well as computation time.
  
    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 
    as row names and gene names as column names.
    LabelsPath : Cell population annotations file path (.csv).
    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
    OutputDir : Output directory defining the path of the exported file.
    The working directory of the process is changed to this directory.
    GeneOrderPath : Gene order file path (.csv) obtained from feature selection, 
    defining the genes order for each cross validation fold, default is "" 
    (the file is only read when NumGenes > 0).
    NumGenes : Number of genes used in case of feature selection (integer), default is 0.
    Threshold : Threshold used when rejecting the cells, default is 0.7.

    Outputs
    -------
    SVM_True_Labels.csv, SVM_Pred_Labels.csv, SVM_Training_Time.csv and SVM_Testing_Time.csv
    written in OutputDir; the prefix becomes "SVM_<NumGenes>_" when NumGenes > 0.
    These are the same file names as the ones written by run_SVM in run_SVM.py,
    so use a different OutputDir to keep both sets of results.
    '''
        
    # read the Rdata file
    robjects.r['load'](CV_RDataPath)

    nfolds = int(np.squeeze(np.array(robjects.r['n_folds'], dtype = 'int')))
    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
    # indices stored by R are 1-based, pandas positions are 0-based
    col = np.array(robjects.r['col_Index'], dtype = 'int') - 1
    # Keep the per-fold index vectors in their R list and convert one fold at a
    # time: folds may differ in length, and NumPy >= 1.24 raises an error when
    # asked to build a single array from ragged sequences.
    test_ind = robjects.r['Test_Idx']
    train_ind = robjects.r['Train_Idx']

    # read the data
    data = pd.read_csv(DataPath,index_col=0,sep=',')
    labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col)
    
    labels = labels.iloc[tokeep]
    data = data.iloc[tokeep]
    
    # read the feature file
    if (NumGenes > 0):
        features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')
    
    # folder with results
    os.chdir(OutputDir)
    
    # normalize data
    data = np.log1p(data)
        
    # LinearSVC has no predict_proba; the calibration wrapper provides it
    Classifier = LinearSVC()
    clf = CalibratedClassifierCV(Classifier)
            
    tr_time=[]
    ts_time=[]
    truelab = []
    pred = []
        
    for i in range(nfolds):
        test_ind_i = np.array(test_ind[i], dtype = 'int') - 1
        train_ind_i = np.array(train_ind[i], dtype = 'int') - 1
    
        train=data.iloc[train_ind_i]
        test=data.iloc[test_ind_i]
        y_train=labels.iloc[train_ind_i]
        y_test=labels.iloc[test_ind_i]
            
        if (NumGenes > 0):
            # column i of the gene order file holds the gene ranking of fold i
            feat_to_use = features.iloc[0:NumGenes,i]
            train = train.iloc[:,feat_to_use]
            test = test.iloc[:,feat_to_use]
            
        # sklearn expects a 1-D target; passing the one-column DataFrame
        # raises a DataConversionWarning on every fold
        y_train_1d = y_train.values.ravel()
            
        start=tm.time()
        clf.fit(train, y_train_1d)
        tr_time.append(tm.time()-start)
                    
        start=tm.time()
        # An object array accepts the 'Unknown' marker whatever the label dtype:
        # a numeric array would raise on the assignment below and a fixed-width
        # string array could silently truncate the marker.
        predicted = np.asarray(clf.predict(test), dtype = object)
        prob = np.max(clf.predict_proba(test), axis = 1)
        unlabeled = np.where(prob < Threshold)
        predicted[unlabeled] = 'Unknown'
        ts_time.append(tm.time()-start)
            
        truelab.extend(y_test.values)
        pred.extend(predicted)
                
    truelab = pd.DataFrame(truelab)
    pred = pd.DataFrame(pred)
        
    tr_time = pd.DataFrame(tr_time)
    ts_time = pd.DataFrame(ts_time)
        
    # the number of genes is part of the file names only when feature selection is used
    if (NumGenes == 0):
        prefix = "SVM_"
    else:
        prefix = "SVM_" + str(NumGenes) + "_"
        
    truelab.to_csv(prefix + "True_Labels.csv", index = False)
    pred.to_csv(prefix + "Pred_Labels.csv", index = False)
    tr_time.to_csv(prefix + "Training_Time.csv", index = False)
    ts_time.to_csv(prefix + "Testing_Time.csv", index = False)

    
================================================
FILE: Scripts/run_SingleR.R
================================================
run_SingleR<-function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){
  "
  run SingleR
  Wrapper script to run SingleR on a benchmark dataset with the cross validation
  folds stored in the RData file,
  outputs lists of true and predicted cell labels as csv files, as well as computation time.
  
  Parameters
  ----------
  DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 
  as row names and gene names as column names.
  LabelsPath : Cell population annotations file path (.csv).
  CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
  OutputDir : Output directory defining the path of the exported file
  (the working directory is switched to it before writing).
  GeneOrderPath : Gene order file path (.csv) obtained from feature selection, 
  defining the genes order for each cross validation fold, default is NULL.
  NumGenes : Number of genes used in case of feature selection (integer), default is NULL.
  Feature selection is applied only when both GeneOrderPath and NumGenes are given.
  "
  
  Data <- read.csv(DataPath,row.names = 1)
  Labels <- as.matrix(read.csv(LabelsPath))
  # brings n_folds, col_Index, Cells_to_Keep, Train_Idx and Test_Idx into scope
  load(CV_RDataPath)
  Labels <- as.vector(Labels[,col_Index])
  Data <- Data[Cells_to_Keep,]
  Labels <- Labels[Cells_to_Keep]
  
  # scalar condition, evaluated once and reused for reading, running and writing
  use_gene_subset <- !is.null(GeneOrderPath) && !is.null(NumGenes)
  if (use_gene_subset) {
    GenesOrder <- read.csv(GeneOrderPath)
  }
  
  #############################################################################
  #                               SingleR                                     #
  #############################################################################
  library(SingleR)
  library(Seurat)
  True_Labels_SingleR <- vector("list", n_folds)
  Pred_Labels_SingleR <- vector("list", n_folds)
  Total_Time_SingleR <- vector("list", n_folds)
  # SingleR is given genes x cells matrices
  Data <- t(as.matrix(Data))
  
  # seq_len() yields no iteration when n_folds is 0, whereas 1:n_folds would run over 1, 0
  for (i in seq_len(n_folds)) {
    test_cells <- Test_Idx[[i]]
    train_cells <- Train_Idx[[i]]
    
    if (use_gene_subset) {
      start_time <- Sys.time()
      # the stored gene indices are shifted by one before being used as row positions
      # (presumably they are 0-based in the gene order file -- TODO confirm)
      genes <- as.vector(GenesOrder[seq_len(NumGenes), i]) + 1
      singler <- SingleR(method = "single", Data[genes, test_cells], 
                         Data[genes, train_cells], 
                         Labels[train_cells], numCores = 1)
      end_time <- Sys.time()
    } else {
      start_time <- Sys.time()
      singler <- SingleR(method = "single", Data[, test_cells], Data[, train_cells], 
                         Labels[train_cells], numCores = 1)
      end_time <- Sys.time()
    }
    Total_Time_SingleR[i] <- as.numeric(difftime(end_time,start_time,units = 'secs'))
    
    True_Labels_SingleR[i] <- list(Labels[test_cells])
    Pred_Labels_SingleR[i] <- list(as.vector(singler$labels))
  }
  True_Labels_SingleR <- as.vector(unlist(True_Labels_SingleR))
  Pred_Labels_SingleR <- as.vector(unlist(Pred_Labels_SingleR))
  Total_Time_SingleR <- as.vector(unlist(Total_Time_SingleR))
  
  setwd(OutputDir)
  
  # the number of genes is part of the file names only when feature selection is used
  if (use_gene_subset) {
    file_prefix <- paste0('SingleR_', NumGenes, '_')
  } else {
    file_prefix <- 'SingleR_'
  }
  write.csv(True_Labels_SingleR, paste0(file_prefix, 'True_Labels.csv'), row.names = FALSE)
  write.csv(Pred_Labels_SingleR, paste0(file_prefix, 'Pred_Labels.csv'), row.names = FALSE)
  write.csv(Total_Time_SingleR, paste0(file_prefix, 'Total_Time.csv'), row.names = FALSE)
}


================================================
FILE: Scripts/run_kNN50.py
================================================
import os 
import numpy as np
import pandas as pd
import time as tm
from sklearn.neighbors import KNeighborsClassifier
import rpy2.robjects as robjects


def run_kNN50(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
    '''
    run baseline classifiers: kNN
    Wrapper script to run kNN (with k = 50) classifier on a benchmark dataset,
    using the cross validation folds stored in the RData file,
    outputs lists of true and predicted cell labels as csv files, as well as computation time.
  
    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 
    as row names and gene names as column names.
    LabelsPath : Cell population annotations file path (.csv).
    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
    OutputDir : Output directory defining the path of the exported file.
    The working directory of the process is changed to this directory.
    GeneOrderPath : Gene order file path (.csv) obtained from feature selection, 
    defining the genes order for each cross validation fold, default is "" 
    (the file is only read when NumGenes > 0).
    NumGenes : Number of genes used in case of feature selection (integer), default is 0.

    Outputs
    -------
    kNN50_True_Labels.csv, kNN50_Pred_Labels.csv, kNN50_Training_Time.csv and kNN50_Testing_Time.csv
    written in OutputDir; the prefix becomes "kNN50_<NumGenes>_" when NumGenes > 0.
    '''
        
    # read the Rdata file
    robjects.r['load'](CV_RDataPath)

    nfolds = int(np.squeeze(np.array(robjects.r['n_folds'], dtype = 'int')))
    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
    # indices stored by R are 1-based, pandas positions are 0-based
    col = np.array(robjects.r['col_Index'], dtype = 'int') - 1
    # Keep the per-fold index vectors in their R list and convert one fold at a
    # time: folds may differ in length, and NumPy >= 1.24 raises an error when
    # asked to build a single array from ragged sequences.
    test_ind = robjects.r['Test_Idx']
    train_ind = robjects.r['Train_Idx']

    # read the data
    data = pd.read_csv(DataPath,index_col=0,sep=',')
    labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col)
    
    labels = labels.iloc[tokeep]
    data = data.iloc[tokeep]
    
    # read the feature file
    if (NumGenes > 0):
        features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')
    
    # folder with results
    os.chdir(OutputDir)
    
    # normalize data
    data = np.log1p(data)
        
    Classifier = KNeighborsClassifier(n_neighbors=50)
            
    tr_time=[]
    ts_time=[]
    truelab = []
    pred = []
        
    for i in range(nfolds):
        test_ind_i = np.array(test_ind[i], dtype = 'int') - 1
        train_ind_i = np.array(train_ind[i], dtype = 'int') - 1
    
        train=data.iloc[train_ind_i]
        test=data.iloc[test_ind_i]
        y_train=labels.iloc[train_ind_i]
        y_test=labels.iloc[test_ind_i]
            
        if (NumGenes > 0):
            # column i of the gene order file holds the gene ranking of fold i
            feat_to_use = features.iloc[0:NumGenes,i]
            train = train.iloc[:,feat_to_use]
            test = test.iloc[:,feat_to_use]
            
        # sklearn expects a 1-D target; passing the one-column DataFrame
        # raises a DataConversionWarning on every fold
        y_train_1d = y_train.values.ravel()
            
        start=tm.time()
        Classifier.fit(train, y_train_1d)
        tr_time.append(tm.time()-start)
                    
        start=tm.time()
        predicted = Classifier.predict(test)
        ts_time.append(tm.time()-start)
            
        truelab.extend(y_test.values)
        pred.extend(predicted)
                
    truelab = pd.DataFrame(truelab)
    pred = pd.DataFrame(pred)
        
    tr_time = pd.DataFrame(tr_time)
    ts_time = pd.DataFrame(ts_time)
        
    # the number of genes is part of the file names only when feature selection is used
    if (NumGenes == 0):
        prefix = "kNN50_"
    else:
        prefix = "kNN50_" + str(NumGenes) + "_"
        
    truelab.to_csv(prefix + "True_Labels.csv", index = False)
    pred.to_csv(prefix + "Pred_Labels.csv", index = False)
    tr_time.to_csv(prefix + "Training_Time.csv", index = False)
    ts_time.to_csv(prefix + "Testing_Time.csv", index = False)

    
================================================
FILE: Scripts/run_kNN9.py
================================================
import os 
import numpy as np
import pandas as pd
import time as tm
from sklearn.neighbors import KNeighborsClassifier
import rpy2.robjects as robjects


def run_kNN9(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
    '''
    run baseline classifiers: kNN
    Wrapper script to run kNN (with k = 9) classifier on a benchmark dataset,
    using the cross validation folds stored in the RData file,
    outputs lists of true and predicted cell labels as csv files, as well as computation time.
  
    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 
    as row names and gene names as column names.
    LabelsPath : Cell population annotations file path (.csv).
    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
    OutputDir : Output directory defining the path of the exported file.
    The working directory of the process is changed to this directory.
    GeneOrderPath : Gene order file path (.csv) obtained from feature selection, 
    defining the genes order for each cross validation fold, default is "" 
    (the file is only read when NumGenes > 0).
    NumGenes : Number of genes used in case of feature selection (integer), default is 0.

    Outputs
    -------
    kNN9_True_Labels.csv, kNN9_Pred_Labels.csv, kNN9_Training_Time.csv and kNN9_Testing_Time.csv
    written in OutputDir; the prefix becomes "kNN9_<NumGenes>_" when NumGenes > 0.
    '''
        
    # read the Rdata file
    robjects.r['load'](CV_RDataPath)

    nfolds = int(np.squeeze(np.array(robjects.r['n_folds'], dtype = 'int')))
    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
    # indices stored by R are 1-based, pandas positions are 0-based
    col = np.array(robjects.r['col_Index'], dtype = 'int') - 1
    # Keep the per-fold index vectors in their R list and convert one fold at a
    # time: folds may differ in length, and NumPy >= 1.24 raises an error when
    # asked to build a single array from ragged sequences.
    test_ind = robjects.r['Test_Idx']
    train_ind = robjects.r['Train_Idx']

    # read the data
    data = pd.read_csv(DataPath,index_col=0,sep=',')
    labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col)
    
    labels = labels.iloc[tokeep]
    data = data.iloc[tokeep]
    
    # read the feature file
    if (NumGenes > 0):
        features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')
    
    # folder with results
    os.chdir(OutputDir)
    
    # normalize data
    data = np.log1p(data)
        
    Classifier = KNeighborsClassifier(n_neighbors=9)
            
    tr_time=[]
    ts_time=[]
    truelab = []
    pred = []
        
    for i in range(nfolds):
        test_ind_i = np.array(test_ind[i], dtype = 'int') - 1
        train_ind_i = np.array(train_ind[i], dtype = 'int') - 1
    
        train=data.iloc[train_ind_i]
        test=data.iloc[test_ind_i]
        y_train=labels.iloc[train_ind_i]
        y_test=labels.iloc[test_ind_i]
            
        if (NumGenes > 0):
            # column i of the gene order file holds the gene ranking of fold i
            feat_to_use = features.iloc[0:NumGenes,i]
            train = train.iloc[:,feat_to_use]
            test = test.iloc[:,feat_to_use]
            
        # sklearn expects a 1-D target; passing the one-column DataFrame
        # raises a DataConversionWarning on every fold
        y_train_1d = y_train.values.ravel()
            
        start=tm.time()
        Classifier.fit(train, y_train_1d)
        tr_time.append(tm.time()-start)
                    
        start=tm.time()
        predicted = Classifier.predict(test)
        ts_time.append(tm.time()-start)
            
        truelab.extend(y_test.values)
        pred.extend(predicted)
                
    truelab = pd.DataFrame(truelab)
    pred = pd.DataFrame(pred)
        
    tr_time = pd.DataFrame(tr_time)
    ts_time = pd.DataFrame(ts_time)
        
    # the number of genes is part of the file names only when feature selection is used
    if (NumGenes == 0):
        prefix = "kNN9_"
    else:
        prefix = "kNN9_" + str(NumGenes) + "_"
        
    truelab.to_csv(prefix + "True_Labels.csv", index = False)
    pred.to_csv(prefix + "Pred_Labels.csv", index = False)
    tr_time.to_csv(prefix + "Training_Time.csv", index = False)
    ts_time.to_csv(prefix + "Testing_Time.csv", index = False)

    
================================================
FILE: Scripts/run_moana.py
================================================
import os
import pandas as pd
import numpy as np
from moana.core import ExpMatrix
from moana.classify import CellTypeClassifier
import time as tm
import rpy2.robjects as robjects

def run_moana(DataPath, LabelsPath, ClassifierPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
    '''
    run moana
    Wrapper script to run moana on a benchmark dataset with a pretrained classifier,
    outputs lists of true and predicted cell labels as csv files, as well as computation time.

    Only cells whose annotation is one of the populations listed in ct_old are kept;
    their labels are renamed to the matching entry of ct_new before being written.
    No cross validation is performed, the whole (filtered) dataset is predicted at once.

    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
    as row names and gene names as column names.
    LabelsPath : Cell population annotations file path (.csv).
    ClassifierPath : Data file path to the pretrained classifier.
    OutputDir : Output directory defining the path of the exported file.
    The process working directory is changed to this directory.
    GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
    only read when NumGenes > 0 (first column is used), default is "".
    NumGenes : Number of genes used in case of feature selection (integer), default is 0.
    '''

    matrix = ExpMatrix.read_tsv(DataPath, sep = ',')
    truelab = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',')

    # annotation names found in the labels file and the names they are renamed to
    ct_old = ['CD19+ B','CD14+ Monocyte','CD4+/CD45RA+/CD25- Naive T','CD4+/CD45RO+ Memory','CD8+/CD45RA+ Naive Cytotoxic','Dendritic', 'CD56+ NK']
    ct_new = ['B cells','CD14+ monocytes','Naive CD4+ T cells','Memory CD4+ T cells','Naive CD8+ T cells','Dendritic cells','NK cells']

    # keep only the cells annotated with one of the populations in ct_old
    tokeep2 = np.isin(truelab,ct_old)
    truelab = truelab[tokeep2]
    print(len(truelab))
    matrix = matrix.iloc[np.squeeze(tokeep2)]

    # Rename all annotations in one pass. ct_old and ct_new share no values, so
    # this gives the same result as replacing the pairs one after another.
    # (Assigning through .iloc with a boolean DataFrame is rejected by current
    # pandas releases, hence the use of replace.)
    truelab = truelab.replace(dict(zip(ct_old, ct_new)))

    # read the feature file
    if (NumGenes > 0):
        features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')
        feat_to_use = features.iloc[0:NumGenes,0]
        matrix = matrix.iloc[:,feat_to_use]

    # The csv was read with cells as rows, so transpose it and swap the two axes
    # when building the matrix handed to the classifier.
    # NOTE(review): assumes ExpMatrix expects genes as rows - confirm against moana.
    data = ExpMatrix(X = np.transpose(matrix.X), genes = matrix.cells, cells = matrix.genes)
    data.genes.name = 'Genes'
    data.cells.name = 'Cells'
    data.index.name = 'Genes'
    data.columns.name = 'Cells'

    clf = CellTypeClassifier.read_pickle(ClassifierPath)

    # only the prediction step is timed
    start = tm.time()
    predictions = clf.predict(data)
    runtime = tm.time() - start

    pred = pd.DataFrame(predictions)

    os.chdir(OutputDir)

    if (NumGenes == 0):
        truelab.to_csv("moana_True_Labels.csv", index = False)
        pred.to_csv("moana_Pred_Labels.csv", index = False)
        with open("moana_Total_Time.csv", 'w') as f:
            f.write("%f\n" % runtime)
    else:
        truelab.to_csv("moana_" + str(NumGenes) + "_True_Labels.csv", index = False)
        pred.to_csv("moana_" + str(NumGenes) + "_Pred_Labels.csv", index = False)
        with open("moana_" + str(NumGenes) + "_Total_Time.csv", 'w') as f:
            f.write("%f\n" % runtime)


================================================
FILE: Scripts/run_scID.R
================================================
run_scID<-function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){
  "
  run scID
  Cross validation wrapper around scID: for every fold stored in the RData file
  the test cells are classified with the training cells as reference, after which
  the true labels, predicted labels and per-fold run time are written as csv files.
  
  Parameters
  ----------
  DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 
  as row names and gene names as column names.
  LabelsPath : Cell population annotations file path (.csv).
  CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
  OutputDir : Output directory defining the path of the exported file
  (becomes the working directory).
  GeneOrderPath : Gene order file path (.csv) obtained from feature selection, 
  defining the genes order for each cross validation fold, default is NULL.
  NumGenes : Number of genes used in case of feature selection (integer), default is NULL.
  Feature selection is only applied when both GeneOrderPath and NumGenes are given.
  "
  
  Data <- read.csv(DataPath,row.names = 1)
  Labels <- as.matrix(read.csv(LabelsPath))
  # provides n_folds, Train_Idx, Test_Idx, col_Index and Cells_to_Keep used below
  load(CV_RDataPath)
  Labels <- as.vector(Labels[,col_Index])[Cells_to_Keep]
  Data <- Data[Cells_to_Keep,]
  
  use_gene_subset <- !is.null(GeneOrderPath) && !is.null(NumGenes)
  if (use_gene_subset){
    GenesOrder <- read.csv(GeneOrderPath)
  }
  
  #############################################################################
  #                                 scID                                      #
  #############################################################################
  library(scID)
  library(Seurat)
  fold_truth <- list()
  fold_pred <- list()
  fold_time <- list()
  # genes as rows, cells as columns from here on
  Data <- t(as.matrix(Data))
  
  for (fold in c(1:n_folds)){
    train_cells <- Train_Idx[[fold]]
    test_cells <- Test_Idx[[fold]]
    
    # Row selector for this fold: the top NumGenes genes of the fold's column in
    # the gene order file (shifted by one), or TRUE to keep every gene.
    if (use_gene_subset){
      gene_rows <- as.vector(GenesOrder[c(1:NumGenes),fold])+1
    } else {
      gene_rows <- TRUE
    }
    
    train_labels <- Labels[train_cells]
    names(train_labels) <- colnames(Data[gene_rows,train_cells])
    
    start_time <- Sys.time()
    scID_output <- scid_multiclass(Data[gene_rows,test_cells],
                                   Data[gene_rows,train_cells],
                                   train_labels)
    end_time <- Sys.time()
    
    fold_time[[fold]] <- as.numeric(difftime(end_time,start_time,units = 'secs'))
    fold_truth[[fold]] <- Labels[test_cells]
    fold_pred[[fold]] <- as.vector(scID_output$labels)
  }
  all_truth <- as.vector(unlist(fold_truth))
  all_pred <- as.vector(unlist(fold_pred))
  all_time <- as.vector(unlist(fold_time))
  
  setwd(OutputDir)
  
  # file names carry the number of genes only when feature selection was used
  if (use_gene_subset){
    file_prefix <- paste0('scID_',NumGenes,'_')
  } else {
    file_prefix <- 'scID_'
  }
  write.csv(all_truth,paste0(file_prefix,'True_Labels.csv'),row.names = FALSE)
  write.csv(all_pred,paste0(file_prefix,'Pred_Labels.csv'),row.names = FALSE)
  write.csv(all_time,paste0(file_prefix,'Total_Time.csv'),row.names = FALSE)
}


================================================
FILE: Scripts/run_scPred.R
================================================
run_scPred<-function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){
  "
  run scPred
  Cross validation wrapper around scPred: for every fold stored in the RData file
  a model is trained on the training cells and used to predict the test cells.
  True labels, predicted labels and the per-fold training and testing times are
  written as csv files.
  
  Parameters
  ----------
  DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 
  as row names and gene names as column names.
  LabelsPath : Cell population annotations file path (.csv).
  CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
  OutputDir : Output directory defining the path of the exported file
  (becomes the working directory).
  GeneOrderPath : Gene order file path (.csv) obtained from feature selection, 
  defining the genes order for each cross validation fold, default is NULL.
  NumGenes : Number of genes used in case of feature selection (integer), default is NULL.
  Feature selection is only applied when both GeneOrderPath and NumGenes are given.
  "
  
  Data <- read.csv(DataPath,row.names = 1)
  Labels <- as.matrix(read.csv(LabelsPath))
  # provides n_folds, Train_Idx, Test_Idx, col_Index and Cells_to_Keep used below
  load(CV_RDataPath)
  Labels <- as.vector(Labels[,col_Index])[Cells_to_Keep]
  Data <- Data[Cells_to_Keep,]
  
  use_gene_subset <- !is.null(GeneOrderPath) && !is.null(NumGenes)
  if (use_gene_subset){
    GenesOrder <- read.csv(GeneOrderPath)
  }
  
  #############################################################################
  #                                scPred                                     #
  #############################################################################
  library(scPred)
  library(tidyverse)
  library(SingleCellExperiment)
  fold_truth <- list()
  fold_pred <- list()
  fold_train_time <- list()
  fold_test_time <- list()
  # genes as rows, cells as columns from here on
  Data <- t(as.matrix(Data))
  
  # scale every column (cell) to counts per million
  to_cpm <- function(counts){
    apply(counts, 2, function(x) (x/sum(x))*1000000)
  }
  
  for (fold in c(1:n_folds)){
    train_cells <- Train_Idx[[fold]]
    test_cells <- Test_Idx[[fold]]
    
    # Row selector for this fold: the top NumGenes genes of the fold's column in
    # the gene order file (shifted by one), or TRUE to keep every gene.
    if (use_gene_subset){
      gene_rows <- as.vector(GenesOrder[c(1:NumGenes),fold])+1
    } else {
      gene_rows <- TRUE
    }
    
    sce <- SingleCellExperiment(list(normcounts = Data[gene_rows,train_cells]), 
                                colData = data.frame(cell_type1 = Labels[train_cells]))
    sce_cpm <- to_cpm(normcounts(sce))
    sce_metadata <- as.data.frame(colData(sce))
    
    sce_test <- SingleCellExperiment(list(normcounts = Data[gene_rows,test_cells]), 
                                     colData = data.frame(cell_type1 = Labels[test_cells]))
    sce_cpm_test <- to_cpm(normcounts(sce_test))
    # computed as before; not referenced again in this function
    sce_metadata_test <- as.data.frame(colData(sce_test))
    
    # scPred Training (seed is reset for every fold, inside the timed section)
    start_time <- Sys.time()
    set.seed(1234)
    scp <- eigenDecompose(sce_cpm)
    scPred::metadata(scp) <- sce_metadata
    scp <- getFeatureSpace(scp, pVar = 'cell_type1')
    scp <- trainModel(scp)
    end_time <- Sys.time()
    fold_train_time[[fold]] <- as.numeric(difftime(end_time,start_time,units = 'secs'))
    
    # scPred Prediction
    start_time <- Sys.time()
    scp <- scPredict(scp,newData = sce_cpm_test)
    end_time <- Sys.time()
    fold_test_time[[fold]] <- as.numeric(difftime(end_time,start_time,units = 'secs'))
    
    fold_truth[[fold]] <- Labels[test_cells]
    fold_pred[[fold]] <- getPredictions(scp)$predClass
  }
  all_truth <- as.vector(unlist(fold_truth))
  all_pred <- as.vector(unlist(fold_pred))
  all_train_time <- as.vector(unlist(fold_train_time))
  all_test_time <- as.vector(unlist(fold_test_time))
  
  setwd(OutputDir)
  
  # file names carry the number of genes only when feature selection was used
  if (use_gene_subset){
    file_prefix <- paste0('scPred_',NumGenes,'_')
  } else {
    file_prefix <- 'scPred_'
  }
  write.csv(all_truth,paste0(file_prefix,'True_Labels.csv'),row.names = FALSE)
  write.csv(all_pred,paste0(file_prefix,'Pred_Labels.csv'),row.names = FALSE)
  write.csv(all_train_time,paste0(file_prefix,'Training_Time.csv'),row.names = FALSE)
  write.csv(all_test_time,paste0(file_prefix,'Testing_Time.csv'),row.names = FALSE)
}


================================================
FILE: Scripts/run_scVI.py
================================================
from scvi.dataset import CsvDataset
import os
import numpy as np
import pandas as pd
from scvi.models import SCANVI
from scvi.inference import SemiSupervisedTrainer
import time as tm
import rpy2.robjects as robjects

def run_scVI(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
    '''
    run scVI
    Wrapper script to run scVI on a benchmark dataset with 5-fold cross validation,
    outputs lists of true and predicted cell labels as csv files, as well as computation time.

    A new SCANVI model and trainer are created for every fold, so that no fold
    starts from weights fitted on cells that belong to its test set.

    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
    as row names and gene names as column names.
    LabelsPath : Cell population annotations file path (.csv).
    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
    OutputDir : Output directory defining the path of the exported file.
    The process working directory is changed to this directory, and the
    intermediate files Labels_scvi.csv and Data_scvi.csv are written there.
    GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
    defining the genes order for each cross validation fold, only read when
    NumGenes > 0, default is "".
    NumGenes : Number of genes used in case of feature selection (integer), default is 0.
    '''

    # read the Rdata file
    robjects.r['load'](CV_RDataPath)

    nfolds = np.array(robjects.r['n_folds'], dtype = 'int')
    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
    col = np.array(robjects.r['col_Index'], dtype = 'int')
    col = col - 1    # R column index is 1-based
    test_ind = np.array(robjects.r['Test_Idx'])
    train_ind = np.array(robjects.r['Train_Idx'])

    # read the data
    data = pd.read_csv(DataPath,index_col=0,sep=',')
    labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col)

    labels = labels.iloc[tokeep]
    data = data.iloc[tokeep]

    # read the feature file
    if (NumGenes > 0):
        features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')

    os.chdir(OutputDir)

    if (NumGenes == 0):
        # Without feature selection the dataset is the same for every fold, so it
        # is written and loaded only once.
        # save labels as csv file with header and index column
        labels.to_csv('Labels_scvi.csv')
        data.to_csv('Data_scvi.csv')

        # NOTE(review): unlike the feature selection path below, new_n_genes is
        # left at the CsvDataset default here - confirm that this is intended.
        train = CsvDataset('Data_scvi.csv', save_path = "", sep = ",", labels_file = "Labels_scvi.csv", gene_by_cell = False)

    n_epochs = 200

    truelab = []
    pred = []
    tr_time = []
    ts_time = []

    for i in range(int(np.squeeze(nfolds))):
        # fold indices come from R and are 1-based
        test_ind_i = np.array(test_ind[i], dtype = 'int') - 1
        train_ind_i = np.array(train_ind[i], dtype = 'int') - 1

        if (NumGenes > 0):
            # the selected genes differ per fold, so the dataset is rebuilt each time
            feat_to_use = features.iloc[0:NumGenes,i]
            data2 = data.iloc[:,feat_to_use]

            labels.to_csv('Labels_scvi.csv')
            data2.to_csv('Data_scvi.csv')

            train = CsvDataset('Data_scvi.csv', save_path = "", sep = ",", labels_file = "Labels_scvi.csv", gene_by_cell = False, new_n_genes = False)

        # Fresh model and trainer for every fold (both with and without feature
        # selection); reusing one trainer would carry over weights fitted on the
        # labels of cells that are test cells in the current fold.
        scanvi = SCANVI(train.nb_genes, train.n_batches, train.n_labels)
        trainer_scanvi = SemiSupervisedTrainer(scanvi, train, frequency=5)

        # replace the trainer's own labelled/unlabelled split with the fold's
        # training and test cells
        trainer_scanvi.labelled_set = trainer_scanvi.create_posterior(indices=(train_ind_i).ravel(), shuffle = False)
        trainer_scanvi.labelled_set.to_monitor = ['ll','accuracy']
        trainer_scanvi.unlabelled_set = trainer_scanvi.create_posterior(indices=(test_ind_i).ravel(), shuffle = False)
        trainer_scanvi.unlabelled_set.to_monitor = ['ll','accuracy']

        start = tm.time()
        trainer_scanvi.train(n_epochs)
        tr_time.append(tm.time()-start)

        ## labels of test set are in y_pred
        ## labels are returned in numbers, should be mapped back to the real labels
        ## indices are permutated
        start = tm.time()
        y_true, y_pred = trainer_scanvi.unlabelled_set.compute_predictions()
        ts_time.append(tm.time()-start)

        truelab.extend(y_true)
        pred.extend(y_pred)

    #write results

    truelab = pd.DataFrame(truelab)
    pred = pd.DataFrame(pred)

    tr_time = pd.DataFrame(tr_time)
    ts_time = pd.DataFrame(ts_time)

    if (NumGenes == 0):
        truelab.to_csv("scVI_True_Labels.csv", index = False)
        pred.to_csv("scVI_Pred_Labels.csv", index = False)
        tr_time.to_csv("scVI_Training_Time.csv", index = False)
        ts_time.to_csv("scVI_Testing_Time.csv", index = False)
    else:
        truelab.to_csv("scVI_" + str(NumGenes) + "_True_Labels.csv", index = False)
        pred.to_csv("scVI_" + str(NumGenes) + "_Pred_Labels.csv", index = False)
        tr_time.to_csv("scVI_" + str(NumGenes) + "_Training_Time.csv", index = False)
        ts_time.to_csv("scVI_" + str(NumGenes) + "_Testing_Time.csv", index = False)
        

================================================
FILE: Scripts/run_scmap.R
================================================
run_scmap <- function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){
  "
  run scmap
  Cross validation wrapper around scmap: for every fold stored in the RData file
  both scmap-cluster and scmap-cell are indexed on the training cells and used to
  annotate the test cells. For each of the two methods the true labels, predicted
  labels and per-fold training and testing times are written as csv files.
  
  Parameters
  ----------
  DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 
  as row names and gene names as column names.
  LabelsPath : Cell population annotations file path (.csv).
  CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
  OutputDir : Output directory defining the path of the exported file
  (becomes the working directory).
  GeneOrderPath : Gene order file path (.csv) obtained from feature selection, 
  defining the genes order for each cross validation fold, default is NULL.
  NumGenes : Number of genes used in case of feature selection (integer), default is NULL.
  Feature selection is only applied when both GeneOrderPath and NumGenes are given.
  "
  
  Data <- read.csv(DataPath,row.names = 1)
  Labels <- as.matrix(read.csv(LabelsPath))
  # provides n_folds, Train_Idx, Test_Idx, col_Index and Cells_to_Keep used below
  load(CV_RDataPath)
  Labels <- as.vector(Labels[,col_Index])[Cells_to_Keep]
  Data <- Data[Cells_to_Keep,]
  
  use_gene_subset <- !is.null(GeneOrderPath) && !is.null(NumGenes)
  if (use_gene_subset){
    GenesOrder <- read.csv(GeneOrderPath)
  }
  
  #############################################################################
  #                                 scmap                                     #
  #############################################################################
  library(scmap)
  library(SingleCellExperiment)
  cluster_truth <- list()
  cluster_pred <- list()
  cell_truth <- list()
  cell_pred <- list()
  cluster_train_time <- list()
  cluster_test_time <- list()
  cell_train_time <- list()
  cell_test_time <- list()
  # genes as rows, cells as columns from here on
  Data <- t(as.matrix(Data))
  
  # Build the SingleCellExperiment for a set of cells: counts, log2(counts + 1)
  # and the gene names as feature symbols.
  build_sce <- function(gene_rows, cell_idx){
    obj <- SingleCellExperiment(list(normcounts = Data[gene_rows,cell_idx]), 
                                colData = data.frame(cell_type1 = Labels[cell_idx]))
    logcounts(obj) <- log2(normcounts(obj) + 1)
    rowData(obj)$feature_symbol <- rownames(obj)
    obj
  }
  
  for (fold in c(1:n_folds)){
    train_cells <- Train_Idx[[fold]]
    test_cells <- Test_Idx[[fold]]
    
    sce <- NULL
    if (use_gene_subset){
      # top NumGenes genes of this fold's column in the gene order file (shifted by one)
      gene_rows <- as.vector(GenesOrder[c(1:NumGenes),fold])+1
      sce <- selectFeatures(build_sce(gene_rows, train_cells), n_features = NumGenes, suppress_plot = TRUE)
    } else {
      gene_rows <- TRUE   # keep every gene
      sce <- selectFeatures(build_sce(gene_rows, train_cells), suppress_plot = TRUE)
    }
    
    sce_test <- build_sce(gene_rows, test_cells)
    # give the test object the same row metadata as the training object
    sce_test@rowRanges@elementMetadata@listData = sce@rowRanges@elementMetadata@listData
    
    # scmap-cluster
    start_time <- Sys.time()
    sce <- indexCluster(sce)
    end_time <- Sys.time()
    cluster_train_time[[fold]] <- as.numeric(difftime(end_time,start_time,units = 'secs'))
    
    start_time <- Sys.time()
    scmapCluster_results <- scmapCluster(projection = sce_test,index_list = list(metadata(sce)$scmap_cluster_index))
    end_time <- Sys.time()
    cluster_test_time[[fold]] <- as.numeric(difftime(end_time,start_time,units = 'secs'))
    
    cluster_truth[[fold]] <- Labels[test_cells]
    cluster_pred[[fold]] <- scmapCluster_results$combined_labs
    
    # scmap-cell (seed is reset for every fold, inside the timed section)
    start_time <- Sys.time()
    set.seed(1)
    sce <- indexCell(sce)
    end_time <- Sys.time()
    cell_train_time[[fold]] <- as.numeric(difftime(end_time,start_time,units = 'secs'))
    
    start_time <- Sys.time()
    scmapCell_results <- scmapCell(sce_test,list(metadata(sce)$scmap_cell_index))
    scmapCell_clusters <- scmapCell2Cluster(scmapCell_results,list(as.character(colData(sce)$cell_type1)))
    end_time <- Sys.time()
    cell_test_time[[fold]] <- as.numeric(difftime(end_time,start_time,units = 'secs'))
    
    cell_truth[[fold]] <- Labels[test_cells]
    cell_pred[[fold]] <- scmapCell_clusters$combined_labs
  }
  
  cluster_truth <- as.vector(unlist(cluster_truth))
  cluster_pred <- as.vector(unlist(cluster_pred))
  cell_truth <- as.vector(unlist(cell_truth))
  cell_pred <- as.vector(unlist(cell_pred))
  cluster_train_time <- as.vector(unlist(cluster_train_time))
  cluster_test_time <- as.vector(unlist(cluster_test_time))
  cell_train_time <- as.vector(unlist(cell_train_time))
  cell_test_time <- as.vector(unlist(cell_test_time))
  
  setwd(OutputDir)
  
  # file names carry the number of genes only when feature selection was used
  if (use_gene_subset){
    gene_tag <- paste0(NumGenes,'_')
  } else {
    gene_tag <- ''
  }
  cluster_prefix <- paste0('scmapcluster_',gene_tag)
  cell_prefix <- paste0('scmapcell_',gene_tag)
  
  write.csv(cluster_truth,paste0(cluster_prefix,'True_Labels.csv'),row.names = FALSE)
  write.csv(cluster_pred,paste0(cluster_prefix,'Pred_Labels.csv'),row.names = FALSE)
  write.csv(cell_truth,paste0(cell_prefix,'True_Labels.csv'),row.names = FALSE)
  write.csv(cell_pred,paste0(cell_prefix,'Pred_Labels.csv'),row.names = FALSE)
  write.csv(cluster_train_time,paste0(cluster_prefix,'Training_Time.csv'),row.names = FALSE)
  write.csv(cluster_test_time,paste0(cluster_prefix,'Testing_Time.csv'),row.names = FALSE)
  write.csv(cell_train_time,paste0(cell_prefix,'Training_Time.csv'),row.names = FALSE)
  write.csv(cell_test_time,paste0(cell_prefix,'Testing_Time.csv'),row.names = FALSE)
}


================================================
FILE: Scripts/run_singleCellNet.R
================================================
run_singleCellNet<-function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){
  "
  run singleCellNet
  Cross validation wrapper around singleCellNet: for every fold stored in the
  RData file a classifier is built on the training cells and applied to the test
  cells. True labels, predicted labels and the per-fold training and testing
  times are written as csv files.
  
  Parameters
  ----------
  DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 
  as row names and gene names as column names.
  LabelsPath : Cell population annotations file path (.csv).
  CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
  OutputDir : Output directory defining the path of the exported file
  (becomes the working directory).
  GeneOrderPath : Gene order file path (.csv) obtained from feature selection, 
  defining the genes order for each cross validation fold, default is NULL.
  NumGenes : Number of genes used in case of feature selection (integer), default is NULL.
  Feature selection is only applied when both GeneOrderPath and NumGenes are given.
  "
  
  Data <- read.csv(DataPath,row.names = 1)
  # underscores in gene names are replaced by dots
  colnames(Data) <- gsub('_','.',colnames(Data), fixed = TRUE)
  Labels <- as.matrix(read.csv(LabelsPath))
  # provides n_folds, Train_Idx, Test_Idx, col_Index and Cells_to_Keep used below
  load(CV_RDataPath)
  Labels <- as.vector(Labels[,col_Index])[Cells_to_Keep]
  Data <- Data[Cells_to_Keep,]
  
  use_gene_subset <- !is.null(GeneOrderPath) && !is.null(NumGenes)
  if (use_gene_subset){
    GenesOrder <- read.csv(GeneOrderPath)
  }
  
  #############################################################################
  #                              singleCellNet                                #
  #############################################################################
  library(singleCellNet)
  library(dplyr)
  fold_truth <- list()
  fold_pred <- list()
  fold_train_time <- list()
  fold_test_time <- list()
  # genes as rows, cells as columns from here on
  Data <- t(as.matrix(Data))              # deals also with sparse matrix
  
  for (fold in c(1:n_folds)){
    train_cells <- Train_Idx[[fold]]
    test_cells <- Test_Idx[[fold]]
    
    # Row selector for this fold: the top NumGenes genes of the fold's column in
    # the gene order file (shifted by one), or TRUE to keep every gene.
    if (use_gene_subset){
      gene_rows <- as.vector(GenesOrder[c(1:NumGenes),fold])+1
    } else {
      gene_rows <- TRUE
    }
    DataTrain <- Data[gene_rows,train_cells]
    DataTest <- Data[gene_rows,test_cells]
    
    # training: classification genes, top gene pairs, then the classifier
    start_time <- Sys.time()
    train_annot <- data.frame(Annotation = Labels[train_cells])
    cgenes2 <- findClassyGenes(DataTrain, train_annot, "Annotation")
    cgenesA <- cgenes2[['cgenes']]
    grps <- cgenes2[['grps']]
    DataTrain <- as.matrix(DataTrain[cgenesA,])
    xpairs <- ptGetTop(DataTrain, grps, ncores = 1)
    pdTrain <- query_transform(DataTrain[cgenesA, ], xpairs)
    rf <- sc_makeClassifier(pdTrain[xpairs,], genes=xpairs, groups=grps)
    end_time <- Sys.time()
    fold_train_time[[fold]] <- as.numeric(difftime(end_time,start_time,units = 'secs'))
    
    # testing: same transformation of the test cells, then prediction
    start_time <- Sys.time()
    DataTest <- query_transform(DataTest[cgenesA,], xpairs)
    classRes <- rf_classPredict(rf, DataTest)
    end_time <- Sys.time()
    fold_test_time[[fold]] <- as.numeric(difftime(end_time,start_time,units = 'secs'))
    
    # highest scoring class per column; only the first length(test_cells)
    # entries are kept
    # NOTE(review): presumably classRes holds extra columns beyond the test cells - confirm.
    best_class <- rownames(classRes)[apply(classRes,2,which.max)]
    fold_truth[[fold]] <- Labels[test_cells]
    fold_pred[[fold]] <- best_class[1:length(test_cells)]
  }
  all_truth <- as.vector(unlist(fold_truth))
  all_pred <- as.vector(unlist(fold_pred))
  all_train_time <- as.vector(unlist(fold_train_time))
  all_test_time <- as.vector(unlist(fold_test_time))
  
  setwd(OutputDir)
  
  # file names carry the number of genes only when feature selection was used
  if (use_gene_subset){
    file_prefix <- paste0('singleCellNet_',NumGenes,'_')
  } else {
    file_prefix <- 'singleCellNet_'
  }
  write.csv(all_truth,paste0(file_prefix,'True_Labels.csv'),row.names = FALSE)
  write.csv(all_pred,paste0(file_prefix,'Pred_Labels.csv'),row.names = FALSE)
  write.csv(all_train_time,paste0(file_prefix,'Training_Time.csv'),row.names = FALSE)
  write.csv(all_test_time,paste0(file_prefix,'Testing_Time.csv'),row.names = FALSE)
}


================================================
FILE: Snakemake/Cross_Validation.R
================================================
args <- commandArgs(TRUE)

Cross_Validation <- function(LabelsPath, col_Index = 1, OutputDir){
  "
  Cross_Validation
  Function returns train and test indices for 5 folds stratified across unique cell populations,
  also filters out cell populations with 10 or fewer cells.
  It writes a 'CV_folds.RData' file which is then used as input to the classifier wrappers.
  The file holds n_folds, Train_Idx, Test_Idx, col_Index and Cells_to_Keep; the
  fold indices refer to the cells that remain after filtering.

  Parameters
  ----------
  LabelsPath : Cell population annotations file path (.csv).
  col_Index : column index (integer) defining which level of annotation to use,
  in case of multiple cell type annotations (default is 1)
  OutputDir : Output directory defining the path of the exported file.
  "

  Labels <- as.matrix(read.csv(LabelsPath))

  # A missing or non-numeric column index would otherwise turn into NA and
  # silently select a column of NA labels.
  if (length(col_Index) != 1 || is.na(col_Index) ||
      col_Index < 1 || col_Index > ncol(Labels)){
    stop('col_Index must be a single column number between 1 and ',
         ncol(Labels), call. = FALSE)
  }
  Labels <- as.vector(Labels[,col_Index])

  # populations that do not have more than 10 cells are dropped
  Removed_classes <- !(table(Labels) > 10)
  Cells_to_Keep <- !(is.element(Labels,names(Removed_classes)[Removed_classes]))
  Labels <- Labels[Cells_to_Keep]

  # Getting training and testing Folds
  library(rBayesianOptimization)
  n_folds <- 5
  Folds <- KFold(Labels,nfolds = n_folds, stratified = TRUE)
  # folds are used as test sets in reverse order (last fold first)
  Test_Folds <- c(n_folds:1)
  Train_Idx <- list()
  Test_Idx <- list()
  for (i in seq_along(Folds)){
    # training cells: every fold except the one held out for testing
    Train_Idx[i] <- list(unlist(Folds[-Test_Folds[i]]))
    Test_Idx[i] <- Folds[Test_Folds[i]]
  }
  save(n_folds,Train_Idx,Test_Idx,col_Index,Cells_to_Keep,file = file.path(OutputDir, 'CV_folds.RData'))
}

# command line entry point: <LabelsPath> <col_Index> <OutputDir>
if (length(args) < 3){
  stop('Usage: Rscript Cross_Validation.R <LabelsPath> <col_Index> <OutputDir>', call. = FALSE)
}
Cross_Validation(args[1], as.numeric(args[2]), args[3])


================================================
FILE: Snakemake/DEgenesMAST.R
================================================
DEgenesMAST <- function(Data, Labels, Normalize = FALSE, LogTransform = FALSE){
  # Runs a differential expression test (Seurat FindAllMarkers with test.use = "MAST")
  # on the data and collects marker genes per cell population.
  # The training data should be used as input.
  # Returns a character matrix with one column per cell population and 20 rows:
  # the first (up to) 20 genes with a positive average log fold change reported
  # for that population; rows that cannot be filled stay NA.
  # This output can be rewritten to the format of the prior-knowledge-supervised
  # classifiers and afterwards be used to classify the test set.
  
  # Data: genes X cells (rows = genes, columns = cells)
  # Labels: labels of the data
  # Normalize: the input for MAST should be cpm normalized data, 
  #            set to TRUE if the data is not normalized yet
  # LogTransform: the input for MAST should be logtransformed,
  #            set to TRUE if the data is not logtransformed yet
  
  
  library(Seurat)
  
  if (Normalize){
    # counts per million, per cell (column)
    Data <- apply(Data, 2, function(x) (x/sum(x))*1000000)
  }
  
  if (LogTransform){
    Data <- log(Data+1, base = 2)
  }
  
  SeuObj <- CreateSeuratObject(raw.data = Data, project = "DEgenes")
  SeuObj <- SetIdent(SeuObj, ident.use = Labels)
  DEgenes <- FindAllMarkers(SeuObj, test.use = "MAST")
  
  populations <- unique(Labels)
  Markers <- matrix(nrow = 20,ncol = length(populations))
  colnames(Markers) <- populations
  
  for (population in populations){
    in_population <- DEgenes$cluster == population
    up_genes <- DEgenes$gene[(in_population & (DEgenes$avg_logFC > 0))]
    
    # diagnostic output: first 20 adjusted p-values reported for this population
    adj_pvals <- DEgenes$p_val_adj[in_population]
    print(adj_pvals[1:20])
    
    n_up <- length(up_genes)
    if (n_up >= 20){
      Markers[,population] <- up_genes[1:20]
    } else if (n_up > 0){
      # fewer than 20 genes found: fill the top rows only
      Markers[seq_len(n_up),population] <- up_genes
    }
  }
  return(Markers)
}


================================================
FILE: Snakemake/Dockerfiles/baseline/Dockerfile
================================================
# Image holding the wrapper scripts copied into /Scripts at the end of this file
# (kNN, LDA, NMC, RF and SVM variants plus rank_gene_dropouts.py).
FROM debian:9.9-slim

# Install R: import a signing key fetched from keys.gnupg.net, register the CRAN
# "stretch-cran35" apt repository and install r-base. The helper packages that
# were only needed to add the repository are purged and the apt caches are
# removed within the same RUN, so they do not remain in this layer.
RUN apt-get update && \
    apt-get install --no-install-recommends --yes wget gnupg apt-transport-https && \
    wget -qO - http://keys.gnupg.net/pks/lookup?op=get\&search=0xAD5F960A256A04AF | apt-key add - && \
    echo 'deb http://cloud.r-project.org/bin/linux/debian stretch-cran35/' >> /etc/apt/sources.list && \
    apt-get update && \
    apt-get install --no-install-recommends --yes r-base && \
    apt-get purge --yes wget gnupg apt-transport-https && \
    apt-get autoremove --yes && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Install Python 3 with pip and the Python packages for the wrapper scripts.
# NOTE(review): rpy2 is presumably what lets the Python wrappers load the
# R-generated CV_folds.RData -- confirm against the scripts.
RUN apt-get update && \
    apt-get install --no-install-recommends --yes python3 python3-pip && \
    pip3 --no-cache-dir install setuptools && \
    pip3 --no-cache-dir install pandas rpy2 scikit-learn statsmodels && \
    rm -rf /var/lib/apt/lists/*

# Copy the wrapper scripts into /Scripts/. Source paths are relative to the
# build context, which therefore has to be the directory containing Scripts/
# and rank_gene_dropouts.py.
COPY Scripts/run_kNN50.py \
     Scripts/run_kNN9.py \
     Scripts/run_LDA.py \
     Scripts/run_LDA_rejection.py \
     Scripts/run_NMC.py \
     Scripts/run_RF.py \
     Scripts/run_SVM.py \
     Scripts/run_SVM_rejection.py \
     rank_gene_dropouts.py \
     /Scripts/


================================================
FILE: Snakemake/Dockerfiles/cell_blast/Dockerfile
================================================
# Image for the Cell_BLAST wrapper script (Scripts/run_Cell_BLAST.py).
FROM python:3.7-slim-stretch

# Install R: import a signing key fetched from keys.gnupg.net, register the CRAN
# "stretch-cran35" apt repository and install r-base. The helper packages that
# were only needed to add the repository are purged and the apt caches are
# removed within the same RUN, so they do not remain in this layer.
RUN apt-get update && \
    apt-get install --no-install-recommends --yes wget gnupg apt-transport-https && \
    wget -qO - http://keys.gnupg.net/pks/lookup?op=get\&search=0xAD5F960A256A04AF | apt-key add - && \
    echo 'deb http://cloud.r-project.org/bin/linux/debian stretch-cran35/' >> /etc/apt/sources.list && \
    apt-get update && \
    apt-get install --no-install-recommends --yes r-base && \
    apt-get purge --yes wget gnupg apt-transport-https && \
    apt-get autoremove --yes && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Install the build toolchain and headers, upgrade pip/setuptools, then install
# the Python packages including tensorflow and Cell-BLAST. None of the pip
# packages is pinned to a version.
# NOTE(review): build-essential and the -dev packages are not purged afterwards;
# confirm whether they are needed at run time.
RUN apt-get update && apt-get install --no-install-recommends --yes build-essential python3-dev libxml2 libxml2-dev zlib1g-dev && \
    pip3 --no-cache-dir install --upgrade pip && \
    pip3 --no-cache-dir install --upgrade setuptools && \
    pip3 --no-cache-dir install pandas rpy2 scikit-learn statsmodels tensorflow Cell-BLAST && \
    apt-get autoremove --yes && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Copy the wrapper script into /Scripts/ (path relative to the build context).
COPY Scripts/run_Cell_BLAST.py /Scripts/


================================================
FILE: Snakemake/Dockerfiles/chetah/Dockerfile
================================================
# Image for the CHETAH wrapper script (Scripts/run_CHETAH.R).
FROM debian:9.9-slim

# Install R: import a signing key fetched from keys.gnupg.net, register the CRAN
# "stretch-cran35" apt repository and install r-base. The helper packages that
# were only needed to add the repository are purged and the apt caches are
# removed within the same RUN, so they do not remain in this layer.
RUN apt-get update && \
    apt-get install --no-install-recommends --yes wget gnupg apt-transport-https && \
    wget -qO - http://keys.gnupg.net/pks/lookup?op=get\&search=0xAD5F960A256A04AF | apt-key add - && \
    echo 'deb http://cloud.r-project.org/bin/linux/debian stretch-cran35/' >> /etc/apt/sources.list && \
    apt-get update && \
    apt-get install --no-install-recommends --yes r-base && \
    apt-get purge --yes wget gnupg apt-transport-https && \
    apt-get autoremove --yes && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Copy the wrapper and the package installation script into /Scripts/ (paths are
# relative to the build context). This happens before the next RUN because that
# step executes /Scripts/install_packages.R.
COPY Scripts/run_CHETAH.R \
     Dockerfiles/chetah/install_packages.R \
     /Scripts/

# Install the compilers and development headers, run the R package installation
# script, then purge the same build-time packages and clear the apt caches in
# this single layer.
RUN apt-get update && \
    apt-get install --no-install-recommends --yes make gcc g++ r-base-dev libcurl4-openssl-dev libssl-dev libxml2-dev && \
    Rscript --vanilla /Scripts/install_packages.R && \
    apt-get purge --yes make gcc g++ r-base-dev libcurl4-openssl-dev libssl-dev libxml2-dev && \
    apt-get autoremove --yes && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*


================================================
FILE: Snakemake/Dockerfiles/chetah/install_packages.R
================================================
# Installs CHETAH (pinned to a fixed commit) and the packages installed alongside
# it. Every warning raised during installation is turned into an error so that
# the calling build step stops instead of continuing with a partial install.
cran_repo <- "https://cloud.r-project.org/"
bioc_packages <- c("bioDist", "ggplot2", "gplots", "cowplot",
                   "dendextend", "corrplot", "reshape2", "plotly")
abort_on_warning <- function(w) stop(w)

withCallingHandlers(
  {
    install.packages("devtools", repos = cran_repo)
    install.packages("BiocManager", repos = cran_repo)
    BiocManager::install(bioc_packages)
    devtools::install_github("jdekanter/CHETAH",
                             ref = "b777e6f671bff3c434842adb655869a52bc9e368")
  },
  warning = abort_on_warning
)


================================================
FILE: Snakemake/Dockerfiles/cross_validation/Dockerfile
================================================
# Image for the fold-generation script (Cross_Validation.R).
FROM debian:9.9-slim

# Install R: import a signing key fetched from keys.gnupg.net, register the CRAN
# "stretch-cran35" apt repository and install r-base. The helper packages that
# were only needed to add the repository are purged and the apt caches are
# removed within the same RUN, so they do not remain in this layer.
RUN apt-get update && \
    apt-get install --no-install-recommends --yes wget gnupg apt-transport-https && \
    wget -qO - http://keys.gnupg.net/pks/lookup?op=get\&search=0xAD5F960A256A04AF | apt-key add - && \
    echo 'deb http://cloud.r-project.org/bin/linux/debian stretch-cran35/' >> /etc/apt/sources.list && \
    apt-get update && \
    apt-get install --no-install-recommends --yes r-base && \
    apt-get purge --yes wget gnupg apt-transport-https && \
    apt-get autoremove --yes && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Copy the script and the package installation script into /Scripts/ (paths are
# relative to the build context). This happens before the next RUN because that
# step executes /Scripts/install_packages.R.
COPY Cross_Validation.R \
     Dockerfiles/cross_validation/install_packages.R \
     /Scripts/

# Install the build tools, run the R package installation script, then purge the
# build tools and clear the apt caches in this single layer.
# NOTE(review): libc6-dev is installed here but is not in the purge list --
# confirm whether leaving it in the image is intentional.
RUN apt-get update && \
    apt-get install --no-install-recommends --yes make gcc libc6-dev g++ libxml2-dev && \
    Rscript --vanilla /Scripts/install_packages.R && \
    apt-get purge --yes make gcc g++ libxml2-dev && \
    apt-get autoremove --yes && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*


================================================
FILE: Snakemake/Dockerfiles/cross_validation/install_packages.R
================================================
# Installs rBayesianOptimization (loaded by Cross_Validation.R) together with
# lhs. Every warning raised during installation is turned into an error so that
# the calling build step stops instead of continuing with a partial install.
cran_repo <- "https://cloud.r-project.org/"
abort_on_warning <- function(w) stop(w)

withCallingHandlers(
  {
    install.packages("lhs", repos = cran_repo)
    install.packages("rBayesianOptimization", repos = cran_repo)
  },
  warning = abort_on_warning
)


================================================
FILE: Snakemake/Dockerfiles/garnett/Dockerfile
================================================
# Image for the two Garnett wrapper scripts (run_Garnett_CV.R and
# run_Garnett_Pretrained.R).
FROM debian:9.9-slim

# Install R: import a signing key fetched from keys.gnupg.net, register the CRAN
# "stretch-cran35" apt repository and install r-base. The helper packages that
# were only needed to add the repository are purged and the apt caches are
# removed within the same RUN, so they do not remain in this layer.
RUN apt-get update && \
    apt-get install --no-install-recommends --yes wget gnupg apt-transport-https && \
    wget -qO - http://keys.gnupg.net/pks/lookup?op=get\&search=0xAD5F960A256A04AF | apt-key add - && \
    echo 'deb http://cloud.r-project.org/bin/linux/debian stretch-cran35/' >> /etc/apt/sources.list && \
    apt-get update && \
    apt-get install --no-install-recommends --yes r-base && \
    apt-get purge --yes wget gnupg apt-transport-https && \
    apt-get autoremove --yes && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Copy the wrappers and the package installation script into /Scripts/ (paths
# are relative to the build context). This happens before the next RUN because
# that step executes /Scripts/install_packages.R.
COPY Scripts/run_Garnett_CV.R \
     Scripts/run_Garnett_Pretrained.R \
     Dockerfiles/garnett/install_packages.R \
     /Scripts/

# Install the compilers and development headers, run the R package installation
# script, then purge the build-time packages and clear the apt caches in this
# single layer.
# NOTE(review): libxml2-dev is installed here but is not in the purge list --
# confirm whether leaving it in the image is intentional.
RUN apt-get update && \
    apt-get install --no-install-recommends --yes make gcc g++ libxml2-dev zlib1g-dev gfortran liblapack-dev libcurl4-gnutls-dev libssl-dev && \
    Rscript --vanilla /Scripts/install_packages.R && \
    apt-get purge --yes make gcc g++ zlib1g-dev gfortran liblapack-dev libcurl4-gnutls-dev libssl-dev && \
    apt-get autoremove --yes && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*


================================================
FILE: Snakemake/Dockerfiles/garnett/install_packages.R
================================================
# Installs Garnett (pinned to a fixed commit) after the Bioconductor packages
# listed below. Every warning raised during installation is turned into an error
# so that the calling build step stops instead of continuing with a partial
# install.
cran_repo <- "https://cloud.r-project.org/"
bioc_packages <- c("monocle", "DelayedArray", "DelayedMatrixStats",
                   "org.Hs.eg.db", "org.Mm.eg.db")
abort_on_warning <- function(w) stop(w)

withCallingHandlers(
  {
    install.packages("BiocManager", repos = cran_repo)
    BiocManager::install(bioc_packages)
    install.packages("devtools", repos = cran_repo)
    devtools::install_github("cole-trapnell-lab/garnett",
                             ref = "9804b532bbcc1714b3ed0b718cf430741f1dba6c")
  },
  warning = abort_on_warning
)


================================================
FILE: Snakemake/Dockerfiles/scid/Dockerfile
================================================
# Image for the scID wrapper script (Scripts/run_scID.R), built on the r-base
# image tagged 3.6.0.
FROM r-base:3.6.0

# Copy the wrapper and the package installation script into /Scripts/ (paths are
# relative to the build context). This happens before the RUN below because that
# step executes /Scripts/install_packages.R.
COPY Scripts/run_scID.R \
     Dockerfiles/scid/install_packages.R \
     /Scripts/

# Install the compilers and development headers, run the R package installation
# script, then purge the same build-time packages and clear the apt caches in
# this single layer.
RUN apt-get update && \
    apt-get install --no-install-recommends --yes make gcc g++ r-base-dev libcurl4-openssl-dev libssl-dev libxml2-dev && \
    Rscript --vanilla /Scripts/install_packages.R && \
    apt-get purge --yes make gcc g++ r-base-dev libcurl4-openssl-dev libssl-dev libxml2-dev && \
    apt-get autoremove --yes && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*


================================================
FILE: Snakemake/Dockerfiles/scid/install_packages.R
================================================
# Installs scID together with Seurat, scater and MAST. Every warning raised
# during installation is turned into an error so that the calling build step
# stops instead of continuing with a partial install.
# NOTE(review): the two GitHub installs are not pinned to a commit, so the
# result depends on the state of those repositories at build time.
cran_repo <- "https://cloud.r-project.org/"
abort_on_warning <- function(w) stop(w)

withCallingHandlers(
  {
    install.packages("BiocManager", repos = cran_repo)
    BiocManager::install(ask = FALSE)
    BiocManager::install(c("scater", "MAST"))
    install.packages("devtools", repos = cran_repo)
    devtools::install_github("satijalab/seurat")
    devtools::install_github("BatadaLab/scID")
  },
  warning = abort_on_warning
)


================================================
FILE: Snakemake/Dockerfiles/scmap/Dockerfile
================================================
# Image for the two scmap wrapper scripts (run_scmapcell.R and
# run_scmapcluster.R), built on the r-base image tagged 3.6.0.
FROM r-base:3.6.0

# Copy the wrappers and the package installation script into /Scripts/ (paths
# are relative to the build context). This happens before the RUN below because
# that step executes /Scripts/install_packages.R.
COPY Scripts/run_scmapcell.R \
     Scripts/run_scmapcluster.R \
     Dockerfiles/scmap/install_packages.R \
     /Scripts/

# Install the compilers and development headers, run the R package installation
# script, then purge the same build-time packages and clear the apt caches in
# this single layer.
RUN apt-get update && \
    apt-get install --no-install-recommends --yes make gcc g++ r-base-dev libcurl4-openssl-dev libssl-dev libxml2-dev && \
    Rscript --vanilla /Scripts/install_packages.R && \
    apt-get purge --yes make gcc g++ r-base-dev libcurl4-openssl-dev libssl-dev libxml2-dev && \
    apt-get autoremove --yes && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*


================================================
FILE: Snakemake/Dockerfiles/scmap/install_packages.R
================================================
# Installs scmap from GitHub together with SingleCellExperiment. Every warning
# raised during installation is turned into an error so that the calling build
# step stops instead of continuing with a partial install.
# NOTE(review): the GitHub install is not pinned to a commit, so the result
# depends on the state of that repository at build time.
cran_repo <- "https://cloud.r-project.org/"
abort_on_warning <- function(w) stop(w)

withCallingHandlers(
  {
    install.packages("BiocManager", repos = cran_repo)
    BiocManager::install(ask = FALSE)
    BiocManager::install("SingleCellExperiment")
    install.packages("devtools", repos = cran_repo)
    devtools::install_github("hemberg-lab/scmap")
  },
  warning = abort_on_warning
)


================================================
FILE: Snakemake/Dockerfiles/scvi/Dockerfile
================================================
# Image for the scVI wrapper script (Scripts/run_scVI.py).
FROM python:3.7-slim-stretch

# Install R: import a signing key fetched from keys.gnupg.net, register the CRAN
# "stretch-cran35" apt repository and install r-base. The helper packages that
# were only needed to add the repository are purged and the apt caches are
# removed within the same RUN, so they do not remain in this layer.
RUN apt-get update && \
    apt-get install --no-install-recommends --yes wget gnupg apt-transport-https && \
    wget -qO - http://keys.gnupg.net/pks/lookup?op=get\&search=0xAD5F960A256A04AF | apt-key add - && \
    echo 'deb http://cloud.r-project.org/bin/linux/debian stretch-cran35/' >> /etc/apt/sources.list && \
    apt-get update && \
    apt-get install --no-install-recommends --yes r-base && \
    apt-get purge --yes wget gnupg apt-transport-https && \
    apt-get autoremove --yes && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Install the build toolchain and headers, upgrade pip/setuptools, then install
# the Python packages including tensorflow and scvi. None of the pip packages is
# pinned to a version.
# NOTE(review): build-essential and the -dev packages are not purged afterwards;
# confirm whether they are needed at run time.
RUN apt-get update && apt-get install --no-install-recommends --yes build-essential python3-dev libxml2 libxml2-dev zlib1g-dev && \
    pip3 --no-cache-dir install --upgrade pip && \
    pip3 --no-cache-dir install --upgrade setuptools && \
    pip3 --no-cache-dir install pandas rpy2 scikit-learn statsmodels tensorflow scvi && \
    apt-get autoremove --yes && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*


# Copy the wrapper script into /Scripts/ (path relative to the build context).
COPY Scripts/run_scVI.py /Scripts/


================================================
FILE: Snakemake/Dockerfiles/singlecellnet/Dockerfile
================================================
# Image for the singleCellNet wrapper script (Scripts/run_singleCellNet.R).
FROM debian:9.9-slim

# Install R: import a signing key fetched from keys.gnupg.net, register the CRAN
# "stretch-cran35" apt repository and install r-base. The helper packages that
# were only needed to add the repository are purged and the apt caches are
# removed within the same RUN, so they do not remain in this layer.
RUN apt-get update && \
    apt-get install --no-install-recommends --yes wget gnupg apt-transport-https && \
    wget -qO - http://keys.gnupg.net/pks/lookup?op=get\&search=0xAD5F960A256A04AF | apt-key add - && \
    echo 'deb http://cloud.r-project.org/bin/linux/debian stretch-cran35/' >> /etc/apt/sources.list && \
    apt-get update && \
    apt-get install --no-install-recommends --yes r-base && \
    apt-get purge --yes wget gnupg apt-transport-https && \
    apt-get autoremove --yes && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Copy the wrapper and the package installation script into /Scripts/ (paths are
# relative to the build context). This happens before the next RUN because that
# step executes /Scripts/install_packages.R.
COPY Scripts/run_singleCellNet.R \
     Dockerfiles/singlecellnet/install_packages.R \
     /Scripts/

# Install the compilers and development headers, run the R package installation
# script, then purge the same build-time packages and clear the apt caches in
# this single layer.
RUN apt-get update && \
    apt-get install --no-install-recommends --yes make gcc libc6-dev g++ libcurl4-openssl-dev zlib1g-dev libssl-dev r-base-dev libxml2-dev && \
    Rscript --vanilla /Scripts/install_packages.R && \
    apt-get purge --yes make gcc g++ zlib1g-dev libcurl4-openssl-dev libc6-dev libssl-dev r-base-dev libxml2-dev && \
    apt-get autoremove --yes && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*


================================================
FILE: Snakemake/Dockerfiles/singlecellnet/install_packages.R
================================================
# Installs singleCellNet and patchwork (both pinned to fixed commits) plus fgsea
# from Bioconductor. Every warning raised during installation is turned into an
# error so that the calling build step stops instead of continuing with a
# partial install.
cran_repo <- "https://cloud.r-project.org/"
abort_on_warning <- function(w) stop(w)

withCallingHandlers(
  {
    install.packages("devtools", repos = cran_repo)
    install.packages("BiocManager", repos = cran_repo)
    BiocManager::install("fgsea")
    devtools::install_github("thomasp85/patchwork",
                             ref = "fd7958bae3e7a1e30237c751952e412a0a1d1242")
    devtools::install_github("pcahan1/singleCellNet",
                             ref = "4279a68112743b783cc82628421dd703261ec117")
  },
  warning = abort_on_warning
)


================================================
FILE: Snakemake/Dockerfiles/singler/Dockerfile
================================================
# Image for the SingleR wrapper script (Scripts/run_SingleR.R).
FROM debian:9.9-slim

# Install R: import a signing key fetched from keys.gnupg.net, register the CRAN
# "stretch-cran35" apt repository and install r-base. The helper packages that
# were only needed to add the repository are purged and the apt caches are
# removed within the same RUN, so they do not remain in this layer.
RUN apt-get update && \
    apt-get install --no-install-recommends --yes wget gnupg apt-transport-https && \
    wget -qO - http://keys.gnupg.net/pks/lookup?op=get\&search=0xAD5F960A256A04AF | apt-key add - && \
    echo 'deb http://cloud.r-project.org/bin/linux/debian stretch-cran35/' >> /etc/apt/sources.list && \
    apt-get update && \
    apt-get install --no-install-recommends --yes r-base && \
    apt-get purge --yes wget gnupg apt-transport-https && \
    apt-get autoremove --yes && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Copy the wrapper and the package installation script into /Scripts/ (paths are
# relative to the build context). This happens before the next RUN because that
# step executes /Scripts/install_packages.R.
COPY Scripts/run_SingleR.R \
     Dockerfiles/singler/install_packages.R \
     /Scripts/

# Install the compilers and development headers, run the R package installation
# script, then purge the build-time packages and clear the apt caches in this
# single layer. libxml2 is installed explicitly and is not in the purge list,
# presumably so that the runtime library stays in the image -- TODO confirm.
RUN apt-get update && \
    apt-get install --no-install-recommends --yes make gcc g++ r-base-dev libcurl4-openssl-dev libssl-dev libxml2-dev libxml2 && \
    Rscript --vanilla /Scripts/install_packages.R && \
    apt-get purge --yes make gcc g++ r-base-dev libcurl4-openssl-dev libssl-dev libxml2-dev && \
    apt-get autoremove --yes && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*


================================================
FILE: Snakemake/Dockerfiles/singler/install_packages.R
================================================
# Installs SingleR (pinned to a fixed commit) together with Seurat from CRAN.
# Every warning raised during installation is turned into an error so that the
# calling build step stops instead of continuing with a partial install.
cran_repo <- "https://cloud.r-project.org/"
abort_on_warning <- function(w) stop(w)

withCallingHandlers(
  {
    install.packages("devtools", repos = cran_repo)
    install.packages("Seurat", repos = cran_repo)
    devtools::install_github("dviraran/SingleR",
                             ref = "db4823b380ba2c3142c857c8c0695200dd1736f6")
  },
  warning = abort_on_warning
)


================================================
FILE: Snakemake/LICENSE
================================================
MIT License

Copyright (c) 2019 tabdelaal

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.


================================================
FILE: Snakemake/README.md
================================================
# scRNAseq_Benchmark
Benchmarking classification tools for scRNA-seq data

## How to use
[snakemake](https://snakemake.readthedocs.io/en/stable/index.html) and
[singularity](https://www.sylabs.io/docs/) need to be available on your 
system. You will need to run this on a linux system, as singularity
only supports linux.

From the root of this repository:
```
snakemake \
  --configfile <configfile> \
  --use-singularity
```

If your data or output directory is not located under the root of this
repository, be sure to tell snakemake to mount the appropriate directories
in singularity:
```
snakemake \
  --configfile <configfile> \
  --use-singularity \
  --singularity-args '--bind <location of inputs>:<location of inputs> --bind <output directory>:<output directory>'
```

#### The config file
```YML
output_dir: <path to outputs directory>
datafile: <path to csv file with counts per cell>
labfile: <csv with true labels per cell>
column: <The index of the column in the labels file which ought to be used, defaults to 1>
number_of_features: <number of features to be used as input for the classification methods, 0 means all, defaults to 0>
genes: <path to gene name list, only needed for garnett_CV and Garnett_Pretrained>
human: <whether or not the data is human, true means human, false means mouse, defaults to true>
tools_to_run: # List of tools to run
  - <tool 1>
  - <tool 2>
  - <...>
```

##### Tool specific inputs
Some tools require specific inputs. Add the following to your config file when
using one of these tools:
- Garnett_CV
  ```YML
  Garnett_CV:
    markers: <path to Garnett marker gene file>
  ```
- Garnett_Pretrained
  ```YML
  Garnett_Pretrained:
    classifier: <path to Garnett classifier>
  ```

<!-- TODO explain these input files -->

## Included tools/methods
- kNN50
- kNN9
- LDA
- LDA_rejection (LDA with rejection option)
- NMC
- RF
- SVM
- SVM_rejection (SVM with rejection option)
- [singleCellNet](https://github.com/pcahan1/singleCellNet)
- [CHETAH](https://github.com/jdekanter/CHETAH)
- [scmap](https://github.com/hemberg-lab/scmap)
  - scmapcell
  - scmapcluster
- [SingleR](https://github.com/dviraran/SingleR)
- [scID](https://github.com/BatadaLab/scID)
- [scVI](https://github.com/YosefLab/scVI)
- [Cell_BLAST](https://github.com/gao-lab/Cell_BLAST)
- [Garnett](https://cole-trapnell-lab.github.io/garnett/)
  - Garnett_CV (without pretrained classifier)
  - Garnett_Pretrained (with pretrained classifier)

## Adding new tools
In order to add a tool to this benchmarking workflow, a rule for this tool
needs to be added to the `Snakefile`. This rule should produce as output:
- a table of predicted labels (`<output directory>/<tool>/<tool>_pred.csv`).
- a table of true labels (`<output directory>/<tool>/<tool>_true.csv`).
- tables of testing, training and/or total time:
  - `<output directory>/<tool>/<tool>_test_time.csv`
  - `<output directory>/<tool>/<tool>_training_time.csv`
  - `<output directory>/<tool>/<tool>_total_time.csv`

The input to this rule should be:
- a count table (specified as the `datafile` in the config).
- a true labels file (specified as the `labfile` in the config).

You will want to write a wrapper script for the tool you want to
add to facilitate this. The `"{output_dir}/CV_folds.RData"` input may be
used to provide your wrapper script with folds for cross_validation.
It is recommended to make a docker image containing all dependencies for both
the tool and any wrappers for the tool.  
This wrapper script should also make a selection of the features to be used.
This selection should be based on the feature ranking, which can be accessed by
providing `feature_ranking` as input to the wrapper script. The number of features to be
used should be configurable and settable through the 'number_of_features' field
in the config.

The following can be used as a template for new rules. Replace everything
surrounded by (and including the) `<>` with appropriate values.
```
rule SVM:
  input:
    datafile = config["datafile"],
    labfile = config["labfile"],
    folds = "{output_dir}/CV_folds.RData",
    ranking = feature_ranking
  output:
    pred = "{output_dir}/<tool name>/<tool name>_pred.csv",
    true = "{output_dir}/<tool name>/<tool name>_true.csv",
    test_time = "{output_dir}/<tool name>/<tool name>_test_time.csv",
    training_time = "{output_dir}/<tool name>/<tool name>_training_time.csv"
  log: "{output_dir}/<tool name>/<tool name>.log"
  params:
    n_features = config.get("number_of_features", 0)
  singularity: "docker://<docker image>"
  shell:
    "<python or Rscript> <wrapper script> "
    "{input.datafile} "
    "{input.labfile} "
    "{input.folds} "
    "{wildcards.output_dir}/<tool name> "
    "{input.ranking} "
    "{params.n_features} "
    "&> {log}"
```


================================================
FILE: Snakemake/Scripts/run_ACTINN.py
================================================
import os 
import numpy as np
import pandas as pd
import time as tm
import rpy2.robjects as robjects

def run_ACTINN(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
    '''
    run ACTINN
    Wrapper script to run ACTINN on a benchmark dataset using the cross validation
    folds stored in CV_RDataPath, outputs lists of true and predicted cell labels
    as csv files, as well as computation time.

    The ACTINN scripts are run as external commands. A RuntimeError is raised as soon
    as one of them exits with a non-zero status, and any prediction file left over
    from an earlier fold is deleted before predicting, so a failed fold can never
    silently reuse stale predictions.

    Side effects: the working directory is changed to OutputDir (and not restored),
    and the intermediate files (train.csv, test.csv, train_lab.csv, test_lab.csv and
    the files written by the ACTINN scripts) are created there.

    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
    as row names and gene names as column names.
    LabelsPath : Cell population annotations file path (.csv).
    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
    OutputDir : Output directory defining the path of the exported file.
    GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
    defining the genes order for each cross validation fold, default is an empty
    string (the file is only read when NumGenes > 0).
    NumGenes : Number of genes used in case of feature selection (integer), default is 0.

    Raises
    ------
    RuntimeError : if one of the ACTINN commands exits with a non-zero status.
    '''

    # NOTE(review): machine-specific absolute location of the ACTINN scripts.
    actinn_dir = "/home/nfs/lcmmichielsen/classifiers/ACTINN"
    prediction_file = 'predicted_label.txt'

    def _run_checked(command):
        # os.system returns the exit status of the command; non-zero means failure.
        status = os.system(command)
        if status != 0:
            raise RuntimeError("ACTINN command failed with status " + str(status) + ": " + command)

    # read the Rdata file
    robjects.r['load'](CV_RDataPath)

    nfolds = np.array(robjects.r['n_folds'], dtype = 'int')
    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
    col = np.array(robjects.r['col_Index'], dtype = 'int')
    col = col - 1  # R column indices are 1-based
    test_ind = np.array(robjects.r['Test_Idx'])
    train_ind = np.array(robjects.r['Train_Idx'])

    # read the data
    data = pd.read_csv(DataPath,index_col=0,sep=',')
    labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col)

    # drop the cells that the cross validation step filtered out
    labels = labels.iloc[tokeep]
    data = data.iloc[tokeep]

    # read the feature file
    if (NumGenes > 0):
        features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')

    # folder with results
    os.chdir(OutputDir)

    tot=[]
    truelab = []
    pred = []

    for i in range(np.squeeze(nfolds)):
        # fold indices come from R and are 1-based
        test_ind_i = np.array(test_ind[i], dtype = 'int') - 1
        train_ind_i = np.array(train_ind[i], dtype = 'int') - 1

        train=data.iloc[train_ind_i]
        test=data.iloc[test_ind_i]
        y_train=labels.iloc[train_ind_i]
        y_test=labels.iloc[test_ind_i]

        if (NumGenes > 0):
            feat_to_use = features.iloc[0:NumGenes,i]
            train = train.iloc[:,feat_to_use]
            test = test.iloc[:,feat_to_use]

        # the files handed to ACTINN are written genes x cells
        train = train.transpose()
        test = test.transpose()

        train.to_csv("train.csv")
        test.to_csv("test.csv")
        y_train.to_csv("train_lab.csv", header = False, index = True, sep = '\t')
        y_test.to_csv("test_lab.csv", header = False, index = True, sep = '\t')

        # NOTE(review): fixed one-minute pause kept from the original; presumably
        # it lets the file system settle before ACTINN reads the files -- confirm.
        tm.sleep(60)

        _run_checked("python " + actinn_dir + "/actinn_format.py -i train.csv -o train -f csv")
        _run_checked("python " + actinn_dir + "/actinn_format.py -i test.csv -o test -f csv")

        # make sure the predictions read below belong to this fold
        if os.path.exists(prediction_file):
            os.remove(prediction_file)

        # only the prediction command is timed
        start = tm.time()
        _run_checked("python " + actinn_dir + "/actinn_predict.py -trs train.h5 -trl train_lab.csv -ts test.h5")
        tot.append(tm.time()-start)

        tm.sleep(60)

        truelab.extend(y_test.values)
        predlabels = pd.read_csv(prediction_file,header=0,index_col=None, sep='\t', usecols = [1])
        pred.extend(predlabels.values)


    truelab = pd.DataFrame(truelab)
    pred = pd.DataFrame(pred)
    tot_time = pd.DataFrame(tot)

    if (NumGenes == 0):
        truelab.to_csv("ACTINN_True_Labels.csv", index = False)
        pred.to_csv("ACTINN_Pred_Labels.csv", index = False)
        tot_time.to_csv("ACTINN_Total_Time.csv", index = False)
    else:
        truelab.to_csv("ACTINN_" + str(NumGenes) + "_True_Labels.csv", index = False)
        pred.to_csv("ACTINN_" + str(NumGenes) + "_Pred_Labels.csv", index = False)
        tot_time.to_csv("ACTINN_" + str(NumGenes) + "_Total_Time.csv", index = False)
        
        
================================================
FILE: Snakemake/Scripts/run_CHETAH.R
================================================
args <- commandArgs(TRUE)

run_CHETAH<-function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){
  "
  run CHETAH
  Runs CHETAH on a benchmark dataset, once per cross validation fold stored in the
  RData file, and writes three csv files into OutputDir: the true labels
  (CHETAH_true.csv), the predicted labels (CHETAH_pred.csv) and the time in seconds
  spent in the classifier call of each fold (CHETAH_total_time.csv).

  Parameters
  ----------
  DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
  as row names and gene names as column names.
  LabelsPath : Cell population annotations file path (.csv).
  CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
  OutputDir : Output directory defining the path of the exported file.
  GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
  defining the genes order for each cross validation fold, default is NULL.
  NumGenes : Number of genes used in case of feature selection (integer), default is NULL.
  Feature selection is applied only when both GeneOrderPath and NumGenes are given.
  "

  Data <- read.csv(DataPath, row.names = 1)
  Labels <- as.matrix(read.csv(LabelsPath))
  # The RData file is expected to provide n_folds, Train_Idx, Test_Idx,
  # col_Index and Cells_to_Keep.
  load(CV_RDataPath)
  Labels <- as.vector(Labels[, col_Index])
  Data <- Data[Cells_to_Keep, ]
  Labels <- Labels[Cells_to_Keep]

  use_ranking <- !is.null(GeneOrderPath) && !is.null(NumGenes)
  if (use_ranking) {
    GenesOrder <- read.csv(GeneOrderPath)
  }

  #############################################################################
  #                                CHETAH                                     #
  #############################################################################
  library(CHETAH)
  library(SingleCellExperiment)

  # From here on the matrix is genes x cells.
  Data <- t(as.matrix(Data))

  fold_truth <- vector("list", n_folds)
  fold_pred <- vector("list", n_folds)
  fold_secs <- vector("list", n_folds)

  for (fold in seq_len(n_folds)) {
    train_cells <- Train_Idx[[fold]]
    test_cells <- Test_Idx[[fold]]

    if (use_ranking) {
      # 1 is added to the ranked gene indices before they are used as row
      # indices (the ranking file presumably stores 0-based positions).
      gene_rows <- as.vector(GenesOrder[c(1:NumGenes), fold]) + 1
      train_counts <- Data[gene_rows, train_cells]
      test_counts <- Data[gene_rows, test_cells]
    } else {
      train_counts <- Data[, train_cells]
      test_counts <- Data[, test_cells]
    }

    sce <- SingleCellExperiment(assays = list(counts = train_counts),
                                colData = data.frame(celltypes = Labels[train_cells]))
    sce_test <- SingleCellExperiment(assays = list(counts = test_counts),
                                     colData = data.frame(celltypes = Labels[test_cells]))

    # Only the classifier call is timed.
    start_time <- Sys.time()
    if (use_ranking) {
      sce_test <- CHETAHclassifier(input = sce_test, ref_cells = sce, n_genes = NumGenes)
    } else {
      sce_test <- CHETAHclassifier(input = sce_test, ref_cells = sce)
    }
    end_time <- Sys.time()

    fold_secs[fold] <- as.numeric(difftime(end_time, start_time, units = 'secs'))
    fold_truth[fold] <- list(Labels[test_cells])
    fold_pred[fold] <- list(sce_test$celltype_CHETAH)
  }

  # Flatten the per-fold results in fold order and export them.
  all_truth <- as.vector(unlist(fold_truth))
  all_pred <- as.vector(unlist(fold_pred))
  all_secs <- as.vector(unlist(fold_secs))
  write.csv(all_truth, paste0(OutputDir, '/CHETAH_true.csv'), row.names = FALSE)
  write.csv(all_pred, paste0(OutputDir, '/CHETAH_pred.csv'), row.names = FALSE)
  write.csv(all_secs, paste0(OutputDir, '/CHETAH_total_time.csv'), row.names = FALSE)
}

# Command-line entry point. `args` holds the trailing command-line arguments
# collected at the top of this file, in the order:
#   <DataPath> <LabelsPath> <CV_RDataPath> <OutputDir> <GeneOrderPath> <NumGenes>
# A sixth argument of "0" means no feature selection: the wrapper is then called
# without the gene ranking so that all genes are used.
# NOTE(review): all six arguments must be supplied; with fewer, args[6] is NA
# and the condition below fails.
if (args[6] == "0") {
  run_CHETAH(args[1], args[2], args[3], args[4])
} else {
  run_CHETAH(args[1], args[2], args[3], args[4], args[5], as.numeric(args[6]))
}


================================================
FILE: Snakemake/Scripts/run_CaSTLe.R
================================================
run_CaSTLe<-function(DataPath,LabelsPath,CV_RDataPath, OutputDir, GeneOrderPath = NULL, NumGenes = NULL){
  "
  run CaSTLe
  Wrapper script to run CaSTLe on a benchmark dataset with cross validation
  (folds are taken from CV_RDataPath), outputs lists of true and predicted cell
  labels as csv files, as well as training and testing computation time.
  
  Parameters
  ----------
  DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 
  as row names and gene names as column names.
  LabelsPath : Cell population annotations file path (.csv).
  CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
  OutputDir : Output directory in which the exported csv files are written.
  GeneOrderPath : Gene order file path (.csv) obtained from feature selection, 
  defining the genes order for each cross validation fold, default is NULL.
  NumGenes : Number of genes used in case of feature selection (integer), default is NULL.
  
  Feature selection is applied only when both GeneOrderPath and NumGenes are given.
  "
  
  Data <- read.csv(DataPath,row.names = 1)
  Labels <- as.matrix(read.csv(LabelsPath))
  # The RData file is expected to define col_Index, Cells_to_Keep, n_folds,
  # Train_Idx and Test_Idx, all of which are used below.
  load(CV_RDataPath)
  Labels <- as.vector(Labels[,col_Index])
  Data <- Data[Cells_to_Keep,]
  Labels <- Labels[Cells_to_Keep]
  
  # Scalar condition, so the short-circuit && is used rather than elementwise &.
  UseFeatureSelection <- !is.null(GeneOrderPath) && !is.null(NumGenes)
  if(UseFeatureSelection){
    GenesOrder = read.csv(GeneOrderPath)
  }
  
  #############################################################################
  #                                CaSTLe                                     #
  #############################################################################
  library(igraph)
  library(xgboost)
  True_Labels_Castle <- list()
  Pred_Labels_Castle <- list()
  Training_Time_Castle <- list()
  Testing_Time_Castle <- list()
  
  # Bin edges used to discretise expression values, and the number of top
  # features taken from each ranking.
  BREAKS=c(-1, 0, 1, 6, Inf)
  nFeatures = 100
  
  for(i in c(1:n_folds)){
    # 1. Load datasets
    if(UseFeatureSelection){
      # NOTE(review): the +1 presumably converts zero-based gene indices in the
      # gene order file to R's one-based columns -- confirm against the file.
      ds1 = Data[Train_Idx[[i]],as.vector(GenesOrder[c(1:NumGenes),i])+1]
      ds2 = Data[Test_Idx[[i]],as.vector(GenesOrder[c(1:NumGenes),i])+1]
    }
    else{
      ds1 = Data[Train_Idx[[i]],]
      ds2 = Data[Test_Idx[[i]],]
    }
    
    sourceCellTypes = as.factor(Labels[Train_Idx[[i]]])
    targetCellTypes = as.factor(Labels[Test_Idx[[i]]])
    
    start_time <- Sys.time()
    # 2. Unify sets, excluding low expressed genes
    source_n_cells_counts = apply(ds1, 2, function(x) { sum(x > 0) } )
    target_n_cells_counts = apply(ds2, 2, function(x) { sum(x > 0) } )
    common_genes = intersect( colnames(ds1)[source_n_cells_counts>10], 
                              colnames(ds2)[target_n_cells_counts>10])
    remove(source_n_cells_counts, target_n_cells_counts)
    ds1 = ds1[, colnames(ds1) %in% common_genes]
    ds2 = ds2[, colnames(ds2) %in% common_genes]
    ds = rbind(ds1[,common_genes], ds2[,common_genes])
    isSource = c(rep(TRUE,nrow(ds1)), rep(FALSE,nrow(ds2)))
    remove(ds1, ds2)
    
    # 3. Highest mean in both source and target
    topFeaturesAvg = colnames(ds)[order(apply(ds, 2, mean), decreasing = TRUE)]
    end_time <- Sys.time()
    Training_Time_Castle[i] <- as.numeric(difftime(end_time,start_time,units = 'secs'))
    
    start_time <- Sys.time()
    # for each cell - what is the most probable classification?
    # One row per source cell type, one column per target cell.
    L = length(levels(sourceCellTypes))
    targetClassification = as.data.frame(matrix(rep(0,L*sum(!isSource)), nrow=L), row.names = levels(sourceCellTypes))
    
    # One binary (one-vs-rest) classifier is trained per source cell type.
    for (cellType in levels(sourceCellTypes)) {
      
      inSourceCellType = as.factor(ifelse(sourceCellTypes == cellType, cellType, paste0("NOT",cellType)))
      
      # 4. Highest mutual information in source
      topFeaturesMi = names(sort(apply(ds[isSource,],2,function(x) { compare(cut(x,breaks=BREAKS),inSourceCellType,method = "nmi") }), decreasing = TRUE))
      
      # 5. Top n genes that appear in both mi and avg
      selectedFeatures = union(head(topFeaturesAvg, nFeatures) , head(topFeaturesMi, nFeatures) )
      
      # 6. remove correlated features
      tmp = cor(ds[,selectedFeatures], method = "pearson")
      tmp[!lower.tri(tmp)] = 0
      selectedFeatures = selectedFeatures[apply(tmp,2,function(x) any(x < 0.9))]
      remove(tmp)
      
      # 7,8. Convert data from continous to binned dummy vars
      # break datasets to bins
      dsBins = apply(ds[, selectedFeatures], 2, cut, breaks= BREAKS)
      # use only bins with more than one value
      nUniq = apply(dsBins, 2, function(x) { length(unique(x)) })
      # convert to dummy vars
      ds0 = model.matrix(~ . , as.data.frame(dsBins[,nUniq>1]))
      remove(dsBins, nUniq)
      
      cat(paste0("<h2>Classifier for ",cellType,"</h2>"))
      
      inTypeSource = sourceCellTypes == cellType
      # 9. Classify
      xg=xgboost(data=ds0[isSource,] , 
                 label=inTypeSource,
                 objective="binary:logistic", 
                 eta=0.7 , nthread=1, nround=20, verbose=0,
                 gamma=0.001, max_depth=5, min_child_weight=10)
      
      # 10. Predict
      inTypeProb = predict(xg, ds0[!isSource, ])
      
      targetClassification[cellType,] = inTypeProb
    }
    end_time <- Sys.time()
    Testing_Time_Castle[i] <- as.numeric(difftime(end_time,start_time,units = 'secs'))
    
    True_Labels_Castle[i] <- list(Labels[Test_Idx[[i]]])
    # Each target cell gets the cell type whose classifier scored it highest.
    Pred_Labels_Castle[i] <- list(rownames(targetClassification)[apply(targetClassification,2,which.max)])
  }
  True_Labels_Castle <- as.vector(unlist(True_Labels_Castle))
  Pred_Labels_Castle <- as.vector(unlist(Pred_Labels_Castle))
  Training_Time_Castle <- as.vector(unlist(Training_Time_Castle))
  Testing_Time_Castle <- as.vector(unlist(Testing_Time_Castle))
  
  # File names are unchanged; they are now placed inside OutputDir instead of
  # the current working directory, as the OutputDir parameter promises.
  if(UseFeatureSelection){
    Suffix <- paste0('Castle_',NumGenes,'.csv')
  }
  else{
    Suffix <- 'CaSTLe.csv'
  }
  write.csv(True_Labels_Castle,file.path(OutputDir,paste0('True_Labels_',Suffix)),row.names = FALSE)
  write.csv(Pred_Labels_Castle,file.path(OutputDir,paste0('Pred_Labels_',Suffix)),row.names = FALSE)
  write.csv(Training_Time_Castle,file.path(OutputDir,paste0('Training_Time_',Suffix)),row.names = FALSE)
  write.csv(Testing_Time_Castle,file.path(OutputDir,paste0('Testing_Time_',Suffix)),row.names = FALSE)
  
}

================================================
FILE: Snakemake/Scripts/run_Cell_BLAST.py
================================================
import os
from sys import argv
from pathlib import Path
import time as tm
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

import tensorflow as tf
tf.logging.set_verbosity(0)

import Cell_BLAST as cb
import numpy as np
from numpy import genfromtxt as gft
import rpy2.robjects as robjects


def run_Cell_BLAST(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
    '''
    run Cell_BLAST
    Wrapper script to run Cell_BLAST on a benchmark dataset with cross validation
    (folds are taken from CV_RDataPath), outputs lists of true and predicted cell
    labels as csv files, as well as training and testing computation time.
  
    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 
    as row names and gene names as column names.
    LabelsPath : Cell population annotations file path (.csv).
    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
    OutputDir : Output directory defining the path of the exported file.
    GeneOrderPath : Gene order file path (.csv) obtained from feature selection, 
    defining the genes order for each cross validation fold, default is "".
    NumGenes : Number of genes used in case of feature selection (integer), default is 0
    (no feature selection; GeneOrderPath is then not read).
    '''
        
    # read the Rdata file
    robjects.r['load'](CV_RDataPath)

    nfolds = np.array(robjects.r['n_folds'], dtype = 'int')
    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
    col = np.array(robjects.r['col_Index'], dtype = 'int')
    col = col - 1  # R column index is one-based
    test_ind = np.array(robjects.r['Test_Idx'])
    train_ind = np.array(robjects.r['Train_Idx'])
    
    # read the feature file
    if (NumGenes > 0):
        features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')

    # read the data and keep only the cells selected by the cross validation file
    data_old = cb.data.ExprDataSet.read_table(DataPath,orientation="cg", sep=",", index_col = 0, header = 0, sparsify = True).normalize()
    data = cb.data.ExprDataSet(data_old.exprs[tokeep],data_old.obs.iloc[tokeep],data_old.var,data_old.uns)

    # read the labels once, as plain strings (a previous pandas read of the same
    # file was discarded before use and has been removed)
    labels = gft(LabelsPath, dtype = "str", skip_header = 1, delimiter = ",", usecols = col)      
    labels = labels[tokeep]
   
    truelab = []
    pred = []
    tr_time = []
    ts_time = []
    
    for i in range(int(np.squeeze(nfolds))):
        # fold indices come from R and are one-based
        test_ind_i = np.array(test_ind[i], dtype = 'int') - 1
        train_ind_i = np.array(train_ind[i], dtype = 'int') - 1

        train=data[train_ind_i,:]
        test=data[test_ind_i,:]
        y_train = labels[train_ind_i]
        y_test = labels[test_ind_i]
        
        if (NumGenes > 0):
            feat_to_use = features.iloc[0:NumGenes,i]
            train = train[:,feat_to_use]
            test = test[:,feat_to_use]

        
        train.obs['cell_type'] = y_train
                
        start = tm.time()
                
        # reduce dimensions: four DIRECTi models with different seeds, each
        # given its own path named after the seed
        num_epoch = 50
        models = []
    
        for j in range(4):
            models.append(cb.directi.fit_DIRECTi(train, epoch=num_epoch, patience=10, random_seed = j, path="%d" % j))
    
        # train model
        blast = cb.blast.BLAST(models, train).build_empirical()
        tr_time.append(tm.time()-start)
        
        # predict labels
        start = tm.time()
        test_pred = blast.query(test).annotate('cell_type')
        ts_time.append(tm.time()-start)

        truelab.extend(y_test)
        pred.extend(test_pred.values)
    
    #write results    
    truelab = pd.DataFrame(truelab)
    pred = pd.DataFrame(pred)
            
    tr_time = pd.DataFrame(tr_time)
    ts_time = pd.DataFrame(ts_time)

    truelab.to_csv(str(Path(OutputDir+"/Cell_BLAST_true.csv")),index = False)
    pred.to_csv(str(Path(OutputDir+"/Cell_BLAST_pred.csv")),index = False)
    tr_time.to_csv(str(Path(OutputDir+"/Cell_BLAST_training_time.csv")), index = False)
    ts_time.to_csv(str(Path(OutputDir+"/Cell_BLAST_test_time.csv")),index = False)


# Command-line entry point: all six positional arguments are required; the
# sixth (NumGenes) is converted to an integer, the others are passed as strings.
run_Cell_BLAST(argv[1], argv[2], argv[3], argv[4], argv[5], int(argv[6]))


================================================
FILE: Snakemake/Scripts/run_DigitalCellSorter.py
================================================
import numpy as np
import pandas as pd
import scripts.DigitalCellSorter as DigitalCellSorter
import os
import time as tm
import rpy2.robjects as robjects

def run_DigitalCellSorter(DataPath, LabelsPath, CV_RDataPath, GeneListPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
    '''
    run DigitalCellSorter
    Wrapper script to run DigitalCellSorter on a benchmark dataset using a predefined genelist,
    outputs lists of true and predicted cell labels as csv files, as well as computation time.  
  
    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 
    as row names and gene names as column names.
    LabelsPath : Cell population annotations file path (.csv).
    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
    GeneListPath : Path to the marker gene list file handed to DigitalCellSorter.
    OutputDir : Output directory defining the path of the exported file.
    GeneOrderPath : Gene order file path (.csv) obtained from feature selection, 
    defining the genes order for each cross validation fold, default is "".
    NumGenes : Number of genes used in case of feature selection (integer), default is 0.

    Side effects
    ------------
    Changes the process working directory to OutputDir before reading the
    voting table and writing the result files.
    '''
        
    # read the Rdata file
    robjects.r['load'](CV_RDataPath)

    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
    col = np.array(robjects.r['col_Index'], dtype = 'int')
    col = col - 1  # R column index is one-based
    
    # read the data
    data = pd.read_csv(DataPath,index_col=0,sep=',')
    data = data.iloc[tokeep]
    
    truelab = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col)
    truelab = truelab.iloc[tokeep]


    # read the feature file; only the first column of the gene order is used
    if (NumGenes > 0):
        features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')
        feat_to_use = features.iloc[0:NumGenes,0]
        data = data.iloc[:,feat_to_use]
        
    # DigitalCellSorter receives the matrix with genes as rows
    data = data.transpose()
    
    # number of different cell types in the data?
    n_clusters = 8
    AvailableCPUsCount = 1
    N_samples_for_distribution = 10000
        
    start = tm.time()
    pred = DigitalCellSorter.DigitalCellSorter().Process(data, 'DigitalCellSorter_Zhang', 
                                                saveDir = OutputDir, 
                                                geneListFileName = GeneListPath,
                                                N_samples_for_distribution = N_samples_for_distribution,
                                                AvailableCPUsCount = AvailableCPUsCount,
                                                clusterIndex=None,
                                                clusterName=None,
                                                n_clusters=n_clusters)
    runtime = tm.time() - start 
    
    os.chdir(OutputDir)
    
    # NOTE(review): column 11 of the voting table is presumably the predicted
    # cell type per cluster -- confirm against DigitalCellSorter's output format.
    results = pd.read_excel('DigitalCellSorter_Zhang_voting.xlsx',header=0,index_col=None, usecols=[11])

    # Translate cluster indices into labels. An object array stores labels of
    # any length; the previous fixed-width '>U10' array silently cut every
    # label down to its first 10 characters.
    prediction = np.full(np.shape(pred), '', dtype=object)
    
    for i in range(len(results)):
        prediction[np.where(pred == i)] = str(results.iloc[i, 0])
    
    prediction = pd.DataFrame(prediction)
        
    if (NumGenes == 0):  
        truelab.to_csv("DigitalCellSorter_True_Labels.csv", index = False)
        prediction.to_csv("DigitalCellSorter_Pred_Labels.csv", index = False)
        with open("DigitalCellSorter_Total_Time.csv", 'w') as f:
            f.write("%f\n" % runtime)
    else:
        truelab.to_csv("DigitalCellSorter_" + str(NumGenes) + "_True_Labels.csv", index = False)
        prediction.to_csv("DigitalCellSorter_" + str(NumGenes) + "_Pred_Labels.csv", index = False)
        with open("DigitalCellSorter_" + str(NumGenes) + "_Total_Time.csv", 'w') as f:
            f.write("%f\n" % runtime)

            
================================================
FILE: Snakemake/Scripts/run_Garnett_CV.R
================================================
# Trailing command-line arguments only (trailingOnly = TRUE); they are passed
# positionally to run_Garnett_CV by the call at the end of this script.
args <- commandArgs(TRUE)

run_Garnett_CV <- function(DataPath, LabelsPath, CV_RDataPath, GenesPath, MarkerPath, OutputDir, Human){
  "
  run Garnett
  Wrapper script to run Garnett on a benchmark dataset with cross validation
  (folds are taken from CV_RDataPath), outputs lists of true and predicted cell
  labels as csv files, as well as training and testing computation time in seconds.
  
  Parameters
  ----------
  DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 
  as row names and gene names as column names.
  LabelsPath : Cell population annotations file path (.csv).
  CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
  GenesPath : Path to the file with the genenames
  MarkerPath : Path to the file with marker genes
  OutputDir : Output directory defining the path of the exported file.
  Human : boolean indicating whether the dataset is human (TRUE) or mouse (FALSE)
  "

  # load needed libraries
  library(garnett)
  if (Human) {
    library(org.Hs.eg.db)
  } else {
    library(org.Mm.eg.db)
  }
  
  # load the CVFile; it is expected to define col_Index, Cells_to_Keep,
  # n_folds, Train_Idx and Test_Idx, all of which are used below
  load(CV_RDataPath)
  
  # read the labels
  labels <- as.matrix(read.csv(LabelsPath))
  labels <- as.vector(labels[,col_Index])
  labels <- labels[Cells_to_Keep]
  
  # read the data; the file is read without a header, so the first row (gene
  # names) and first column (cell barcodes) are stripped by hand
  mat <- read.table(DataPath, sep = ",")
  data <- mat[-1,-1]
  data <- data[Cells_to_Keep,]
  data <- t(data) #ensure that the genes are rows, and the cells are columns
  
  cells <- mat[-1,1]
  cells <- cells[Cells_to_Keep]
  
  # read the genefile 
  fdata <- read.table(GenesPath)
  names(fdata) <- 'gene_short_name'
  row.names(fdata) <- fdata$gene_short_name
  fd <- new("AnnotatedDataFrame", data = fdata)
  
  true_labels <- list()
  pred_labels <- list()
  train_time <- list()
  test_time <- list()
  
  for (i in c(1:n_folds)){
    lab_train = labels[Train_Idx[[i]]]
    lab_test = labels[Test_Idx[[i]]]
    
    train = data[,Train_Idx[[i]]]
    test = data[,Test_Idx[[i]]]
    
    cells_train = cells[Train_Idx[[i]]]
    cells_test = cells[Test_Idx[[i]]]
    
    pdata_train = data.frame(cells_train)
    pdata_test = data.frame(cells_test)
    
    # expression rows/columns must carry the same names as the feature and
    # phenotype tables handed to newCellDataSet
    row.names(train) <- row.names(fdata)
    row.names(test) <- row.names(fdata)
    colnames(train) <- row.names(pdata_train)
    colnames(test) <- row.names(pdata_test)
    
    pd_train <- new("AnnotatedDataFrame", data = pdata_train)
    pd_test <- new("AnnotatedDataFrame", data = pdata_test)
    
    pbmc_cds_train <- newCellDataSet(as(train, "dgCMatrix"), phenoData = pd_train, featureData = fd)
    pbmc_cds_test <- newCellDataSet(as(test, "dgCMatrix"), phenoData = pd_test, featureData = fd)
    
    pbmc_cds_train <- estimateSizeFactors(pbmc_cds_train)
    pbmc_cds_test <- estimateSizeFactors(pbmc_cds_test)
    
    # training
    start_train <- Sys.time()
    
    if (Human){
      pbmc_classifier <- train_cell_classifier(cds = pbmc_cds_train, 
                                               marker_file = MarkerPath,
                                               db=org.Hs.eg.db,
                                               cds_gene_id_type = "SYMBOL",
                                               num_unknown = 50,
                                               marker_file_gene_id_type = "SYMBOL")
    } else {
      pbmc_classifier <- train_cell_classifier(cds = pbmc_cds_train, 
                                               marker_file = MarkerPath,
                                               db=org.Mm.eg.db,
                                               cds_gene_id_type = "SYMBOL",
                                               num_unknown = 50,
                                               marker_file_gene_id_type = "SYMBOL")
      
    }
    end_train <- Sys.time()
    # Subtracting two times picks the unit automatically (secs, mins, ...), so
    # the unit is fixed explicitly to keep every recorded time in seconds.
    train_time[i] <- as.numeric(difftime(end_train, start_train, units = 'secs'))
    
    # testing
    start_test <- Sys.time()
    
    if (Human) {
      pbmc_cds_test <- classify_cells(pbmc_cds_test, 
                                      pbmc_classifier, 
                                      db = org.Hs.eg.db, 
                                      cluster_extend = TRUE,
                                      cds_gene_id_type = "SYMBOL")
    } else {
      pbmc_cds_test <- classify_cells(pbmc_cds_test, 
                                      pbmc_classifier, 
                                      db = org.Mm.eg.db, 
                                      cluster_extend = TRUE,
                                      cds_gene_id_type = "SYMBOL")
    }
    end_test <- Sys.time()
    test_time[i] <- as.numeric(difftime(end_test, start_test, units = 'secs'))
    
    true_labels[i] <- list(lab_test)
    pred_labels[i] <- list(pData(pbmc_cds_test)$cluster_ext_type)
    
    
  }
  
  true_labels <- as.vector(unlist(true_labels))
  pred_labels <- as.vector(unlist(pred_labels))
  train_time <- as.vector(unlist(train_time))
  test_time <- as.vector(unlist(test_time))

  write.csv(true_labels,paste0(OutputDir,'/Garnett_CV_true.csv'),row.names = FALSE)
  write.csv(pred_labels,paste0(OutputDir,'/Garnett_CV_pred.csv'),row.names = FALSE)
  write.csv(train_time,paste0(OutputDir,'/Garnett_CV_training_time.csv'),row.names = FALSE)
  write.csv(test_time,paste0(OutputDir,'/Garnett_CV_test_time.csv'),row.names = FALSE)

}

# Command-line entry point: the seven arguments are forwarded positionally and
# as character strings, in the order of run_Garnett_CV's parameters (the
# seventh, Human, therefore arrives as text such as "TRUE" or "FALSE").
run_Garnett_CV(args[1], args[2], args[3], args[4], args[5], args[6], args[7])


================================================
FILE: Snakemake/Scripts/run_Garnett_Pretrained.R
================================================
# Trailing command-line arguments only (trailingOnly = TRUE); they are passed
# positionally to run_Garnett_Pretrained by the call at the end of this script.
args <- commandArgs(TRUE)

run_Garnett_Pretrained <- function(DataPath, LabelsPath, GenesPath, CV_RDataPath, ClassifierPath, OutputDir, Human){
  "
  run Garnett
  Wrapper script to run Garnett on a benchmark dataset with a pretrained classifier,
  outputs lists of true and predicted cell labels as csv files, as well as
  computation time in seconds.
  
  Parameters (in call order)
  ----------
  DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 
  as row names and gene names as column names.
  LabelsPath : Cell population annotations file path (.csv).
  GenesPath : Path to the file with the genenames
  CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
  ClassifierPath : Path to the pretrained classifier (an RData file defining
  hsPBMC for human data or mmLung for mouse data)
  OutputDir : Output directory defining the path of the exported file.
  Human : boolean indicating whether the dataset is human (TRUE) or mouse (FALSE)
  "
  # load needed libraries
  library(garnett)
  
  if (Human) {
    library(org.Hs.eg.db)
  } else {
    library(org.Mm.eg.db)
  }
  
  # load data, genes, and marker file
  # The CV file is expected to define col_Index and Cells_to_Keep.
  load(CV_RDataPath)
  
  load(ClassifierPath)
  
  # Select the annotation column first, then the kept cells, matching the
  # cross validation wrapper; indexing the whole matrix with Cells_to_Keep is
  # only correct when the labels file has a single column.
  labels <- as.matrix(read.csv(LabelsPath))
  labels <- as.vector(labels[,col_Index])
  labels <- labels[Cells_to_Keep]
  
  # the file is read without a header, so the first row (gene names) and
  # first column (cell barcodes) are stripped by hand
  mat <- read.table(DataPath, sep = ",")
  data <- mat[-1,-1]
  data <- data[Cells_to_Keep,]
  data <- t(data) #ensure that the genes are rows, and the cells are columns
  
  # Barcodes must be filtered exactly like the expression rows; otherwise the
  # phenotype table has more rows than the matrix has columns and assigning
  # the column names below fails as soon as any cell is dropped.
  barcodes <- mat[-1,1]
  barcodes <- barcodes[Cells_to_Keep]
  
  pdata = data.frame(barcodes)
  fdata <- read.table(GenesPath)
  names(fdata) <- 'gene_short_name'
  row.names(fdata) <- fdata$gene_short_name
  
  row.names(data) <- row.names(fdata)
  colnames(data) <- row.names(pdata)
  
  pd <- new("AnnotatedDataFrame", data = pdata)
  fd <- new("AnnotatedDataFrame", data = fdata)
  pbmc_cds <- newCellDataSet(as(data, "dgCMatrix"),
                             phenoData = pd,
                             featureData = fd)
  
  start_time <- Sys.time()
  
  pbmc_cds <- estimateSizeFactors(pbmc_cds)
  
  if (Human){
    pbmc_cds <- classify_cells(pbmc_cds, hsPBMC, db = org.Hs.eg.db, cluster_extend = TRUE, cds_gene_id_type = "SYMBOL")
  } else {
    pbmc_cds <- classify_cells(pbmc_cds, mmLung, db = org.Mm.eg.db, cluster_extend = TRUE, cds_gene_id_type = "SYMBOL")
  }
  
  end_time <- Sys.time()
  
  # Subtracting two times picks the unit automatically (secs, mins, ...), so
  # the unit is fixed explicitly to keep the recorded time in seconds.
  test_time <- as.numeric(difftime(end_time, start_time, units = 'secs'))

  write.table(pData(pbmc_cds)$cluster_ext_type,
              file = paste0(OutputDir, "/Garnett_Pretrained_pred.csv"), append = FALSE, quote = TRUE, sep = "\t",
              eol = "\n", na = "NA", dec = ".", row.names = FALSE,
              qmethod = c("escape", "double"),
              fileEncoding = "")

  write.csv(labels,paste0(OutputDir,"/Garnett_Pretrained_true.csv"), row.names = FALSE)
  write.csv(test_time,paste0(OutputDir,'/Garnett_Pretrained_test_time.csv'),row.names = FALSE)
}

# Command-line entry point: the seven arguments are forwarded positionally and
# as character strings, in the order of run_Garnett_Pretrained's parameters
# (data, labels, genes, CV RData, classifier, output directory, Human).
run_Garnett_Pretrained(args[1], args[2], args[3], args[4], args[5], args[6], args[7])


================================================
FILE: Snakemake/Scripts/run_LAmbDA.py
================================================
# -*- coding: utf-8 -*-
"""
Created on Thu May 23 13:51:15 2019

@author: Lieke
"""

import os 
import numpy as np
import pandas as pd
import time as tm
import rpy2.robjects as robjects
import tensorflow as tf
import math
import scipy.io as sio
import optunity as opt
from tensorflow.contrib.tensor_forest.python import tensor_forest
from tensorflow.python.ops import resources


def run_LAmbDA(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
    '''
    run LAmbDA classifier
    Wrapper script to run LAmbDA on a benchmark dataset with cross validation
    (folds are taken from CV_RDataPath), outputs lists of true and predicted cell
    labels as csv files, as well as training and testing computation time.

    For every fold the hyper-parameters are first tuned with optunity on an
    inner 75/25 split of the training fold, then run_LAmbDA2 is called once
    more with the tuned parameters on the full training and test fold.
    Labels are written as integer class indices (position in the sorted
    unique label values).
  
    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes 
    as row names and gene names as column names.
    LabelsPath : Cell population annotations file path (.csv).
    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
    OutputDir : Output directory defining the path of the exported file.
    GeneOrderPath : Gene order file path (.csv) obtained from feature selection, 
    defining the genes order for each cross validation fold, default is "".
    NumGenes : Number of genes used in case of feature selection (integer), default is 0.

    Side effects
    ------------
    Changes the process working directory to OutputDir and sets the module
    level globals X, Y, Gnp, Dnp, train, test, prt and cv that run_LAmbDA2 reads.
    '''
    global X, Y, Gnp, Dnp, train, test, prt, cv
        
    # read the Rdata file
    robjects.r['load'](CV_RDataPath)

    nfolds = np.array(robjects.r['n_folds'], dtype = 'int')
    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
    col = np.array(robjects.r['col_Index'], dtype = 'int')
    col = col - 1  # R column index is one-based
    test_ind = np.array(robjects.r['Test_Idx'])
    train_ind = np.array(robjects.r['Train_Idx'])

    # read the data
    data = pd.read_csv(DataPath,index_col=0,sep=',')
    labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col)
    
    labels = labels.iloc[tokeep]
    data = data.iloc[tokeep]
    
    # read the feature file
    if (NumGenes > 0):
        features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')
    
    # folder with results
    os.chdir(OutputDir)
                
    tr_time=[]
    ts_time=[]
    truelab = np.zeros([len(labels),1],dtype = int)
    predlab = np.zeros([len(labels),1],dtype = int)
        
    for i in range(int(np.squeeze(nfolds))):
        # fold indices come from R and are one-based
        test_ind_i = np.array(test_ind[i], dtype = 'int') - 1
        train_ind_i = np.array(train_ind[i], dtype = 'int') - 1
                
        X = np.array(data) 
        if (NumGenes > 0):
            X = np.log2(X/10+1)
            feat_to_use = features.iloc[0:NumGenes,i]
            X = X[:,feat_to_use]
        else:
            X = np.log2(np.transpose(select_feats(np.transpose(X),0.5,80))/10+1)
    
        # one-hot encode the labels, one column per unique label value
        uniq = np.unique(labels)
        Y = np.zeros([len(labels),len(uniq)],int)
        
        for j in range(len(uniq)):
            Y[np.where(labels == uniq[j])[0],j] = 1
    
        Y = np.array(Y)
        
        # identity label mask and all-ones dataset matrix (a single dataset)
        Gnp = np.zeros([len(uniq),len(uniq)],int)
        np.fill_diagonal(Gnp,1)
        Gnp = np.array(Gnp)
        
        Dnp = np.ones([len(uniq),1],int)
        Dnp = np.array(Dnp)
        
        # Inner 75/25 split used only for the hyper-parameter search. The
        # permutation is taken over the fold's own cell indices, so no cell of
        # the held-out test fold can enter the search, and the validation part
        # is the remaining quarter. (Permuting positions 0..n-1 and slicing
        # perm[train_samp:test_samp+1] selected unrelated cells and an empty
        # validation set.)
        train_samp = int(np.floor(0.75*len(train_ind_i)))
        # Each class needs at least 5 inner-training cells; for classes with
        # fewer than 7 cells in the fold the requirement is lowered to what a
        # 75% sample can always provide, so the retry loop cannot spin forever.
        fold_counts = np.sum(Y[train_ind_i,:],0)
        min_per_class = np.minimum(5, np.floor(0.75*fold_counts))

        perm = np.random.permutation(train_ind_i)
        train = perm[0:train_samp]
        test = perm[train_samp:]
        
        while(np.sum(np.sum(Y[train,:],0)<min_per_class)>0):
            perm = np.random.permutation(train_ind_i)
            train = perm[0:train_samp]
            test = perm[train_samp:]
        
        cv = i
        prt = False
                    
        start=tm.time()
        opt_params, _, _ = opt.minimize(run_LAmbDA2,solver_name='sobol', gamma=[0.8,1.2], delta=[0.05,0.95], tau=[10.0,11.0], prc_cut=[20,50], bs_prc=[0.2,0.6], num_trees=[10,200], max_nodes=[100,1000], num_evals=50)
        tr_time.append(tm.time()-start)
        
        print("Finished training!")
        
        # final run on the real fold with the tuned parameters
        prt = True
        train = train_ind_i
        test = test_ind_i
        
        start=tm.time()
        run_LAmbDA2(opt_params['gamma'], opt_params['delta'], opt_params['tau'], opt_params['prc_cut'], opt_params['bs_prc'], opt_params['num_trees'], opt_params['max_nodes'])
        ts_time.append(tm.time()-start)
        
        tf.reset_default_graph()
        
        # NOTE(review): these .mat files are presumably written by
        # run_LAmbDA2 when prt is True -- confirm in that function.
        predfile = 'preds_cv' + str(cv) + '.mat'
        truefile = 'truth_cv' + str(cv) + '.mat'
        pred = sio.loadmat(predfile)
        truth = sio.loadmat(truefile)
        
        pred = pred['preds']
        truth = truth['labels']
        
        pred_ind = np.argmax(pred,axis=1)
        truth_ind = np.argmax(truth,axis=1)
        
        predlab[test_ind_i,0] = pred_ind
        truelab[test_ind_i,0] = truth_ind
            
                
    truelab = pd.DataFrame(truelab)
    predlab = pd.DataFrame(predlab)
        
    tr_time = pd.DataFrame(tr_time)
    ts_time = pd.DataFrame(ts_time)
        
    if (NumGenes == 0):  
        truelab.to_csv("LAmbDA_True_Labels.csv", index = False)
        predlab.to_csv("LAmbDA_Pred_Labels.csv", index = False)
        tr_time.to_csv("LAmbDA_Training_Time.csv", index = False)
        ts_time.to_csv("LAmbDA_Testing_Time.csv", index = False)
    else:
        truelab.to_csv("LAmbDA_" + str(NumGenes) + "_True_Labels.csv", index = False)
        predlab.to_csv("LAmbDA_" + str(NumGenes) + "_Pred_Labels.csv", index = False)
        tr_time.to_csv("LAmbDA_" + str(NumGenes) + "_Training_Time.csv", index = False)
        ts_time.to_csv("LAmbDA_" + str(NumGenes) + "_Testing_Time.csv", index = False)


##### Functions copied from LAmbDA's Github
def wt_cutoff(colnum,cutoff,Gtmp,gamma):
	"""Return the weighted sample-count cutoff for one row of the label mask.

	The base cutoff is multiplied by log2(max_row_sum / row_sum[colnum] + 1)
	raised to the power gamma, and the product is rounded up to an integer.
	"""
	row_totals = np.sum(Gtmp, axis=1)
	ratio = max(row_totals) / row_totals[colnum]
	weight = math.log(ratio + 1, 2) ** gamma
	return math.ceil(cutoff * weight)

def resample(prc_cut,Y,Gtmp,train,gamma):
	"""Rebalance the training indices so each class approaches its weighted cutoff.

	Parameters
	----------
	prc_cut : percentile of the per-class training counts used as base cutoff.
	Y : one-hot label matrix, rows are cells and columns are classes.
	Gtmp : label mask matrix handed to wt_cutoff.
	train : array of row indices into Y forming the training set.
	gamma : exponent handed to wt_cutoff.

	Returns
	-------
	Integer array holding the surviving entries of train in their original
	order, followed by the over-sampled indices. Classes with no training
	cells, or already at their cutoff, are left untouched.
	"""
	train = np.asarray(train)
	add = []
	rem = []
	colsums = np.sum(Y[train,:],axis=0)
	cutoff = math.ceil(np.percentile(colsums,prc_cut))
	for i in range(len(colsums)):
		if colsums[i] == 0:
			continue
		target = wt_cutoff(i,cutoff,Gtmp,gamma)
		if colsums[i] == target:
			continue
		# flatnonzero always yields a 1-D index array. Squeezing the np.where
		# result collapsed a single match to a scalar, which np.random.choice
		# interprets as "sample from arange(scalar)", i.e. unrelated cells.
		members = train[np.flatnonzero(Y[train,i]>=1)]
		if colsums[i] < target:
			# under-represented class: draw the missing cells with replacement
			add.extend(np.random.choice(members,int(target-colsums[i])).tolist())
		else:
			# over-represented class: mark the surplus cells for removal
			rem.extend(np.random.choice(members,int(colsums[i]-target),replace=False).tolist())
	# Vectorised membership test instead of a per-element scan of the list.
	kept = train[~np.isin(train,rem)]
	# Keep the index dtype even when nothing is added; concatenating with an
	# empty Python list used to turn the indices into floats.
	return np.concatenate((kept,np.asarray(add,dtype=kept.dtype)))

def select_feats(Xtmp,num_zero_prc_cut,var_prc_cut):
	"""Filter the rows (features) of Xtmp in two passes.

	First keep the rows whose number of zero entries is below
	num_zero_prc_cut times the number of columns; then, among those, keep the
	rows whose variance is strictly above the var_prc_cut-th percentile of
	the remaining row variances.
	"""
	# pass 1: drop features with many zeros
	zero_counts = np.sum(Xtmp == 0, axis=1)
	max_zeros = num_zero_prc_cut * Xtmp.shape[1]
	dense_rows = Xtmp[zero_counts < max_zeros, :]
	# pass 2: drop features with low variance
	row_vars = np.var(dense_rows, axis=1)
	var_threshold = np.percentile(row_vars, var_prc_cut)
	return dense_rows[row_vars > var_threshold, :]

def get_yn(predict,ys,delta,tau,output_feats):
	"""Build the TensorFlow op producing one-hot reassigned labels.

	Reads the module-level arrays Dnp and Gnp. The predictions (offset by
	0.1) are reweighted per class, masked by ys x G, blended with a
	label-aggregated version using delta, and the arg-max of the blend is
	returned as a float32 one-hot tensor with output_feats columns.
	"""
	D = tf.cast(Dnp, tf.float32)
	G = tf.cast(Gnp, tf.float32)
	ys = tf.cast(ys, tf.float32)
	# per-dataset prediction totals divided by the per-dataset label totals
	numer = tf.matmul(tf.transpose(tf.matmul(ys, D)), predict + 0.1)
	denom = tf.reshape(
		tf.reduce_sum(tf.transpose(tf.matmul(ys, D)), 1), (-1, 1))
	Cm = numer / denom
	# row mean of Cm restricted to the entries where D^T x G is positive
	reachable = tf.cast(tf.matmul(tf.transpose(D), G) > 0, tf.float32)
	mCm = tf.reshape(tf.reduce_mean(reachable * Cm, 1), (-1, 1))
	# predictions scaled by (mCm / Cm) ** tau
	yw = tf.multiply(
		predict + 0.1,
		tf.matmul(tf.matmul(ys, D), tf.pow(mCm / Cm, tau)))
	# keep only the entries allowed by the label mask
	ye = tf.multiply(tf.matmul(ys, G), yw)
	# aggregate over cells sharing a label, then spread back to the cells
	yt = tf.matmul(ys, tf.matmul(tf.transpose(ys), ye))
	ya = (delta * yt) + ((1 - delta) * ye)
	winners = tf.argmax(ya, axis=1)
	return tf.cast(tf.one_hot(winners, output_feats), dtype=tf.float32)

def get_yi(rowsums,G2,ys):
	"""Return the float32 matrix product ys x G2; rowsums is not used."""
	mask_f = tf.cast(G2, tf.float32)
	labels_f = tf.cast(ys, tf.float32)
	return tf.cast(tf.matmul(labels_f, mask_f), dtype=tf.float32)

def run_LAmbDA2(gamma, delta, tau, prc_cut, bs_prc, num_trees, max_nodes):
	"""Train and evaluate one tensor-forest model for a hyperparameter setting.

	Reads the module-level globals X, Y, Gnp, Dnp, train, test, prt and cv.

	Parameters
	----------
	gamma : forwarded to resample() when building the resampled training set.
	delta, tau : forwarded to get_yn() when deriving training targets.
	prc_cut : rounded up to an int and forwarded to resample().
	bs_prc : fraction of the resampled training set used as the fixed batch
		(batch size = ceil(bs_prc * train2.size)).
	num_trees, max_nodes : tensor_forest hyperparameters (cast to int).

	Procedure
	---------
	Iterations 1-49 train on the samples whose label maps, through the rows of
	Gnp with row sum <= 1, to exactly one output class. Iterations 50-100
	train on the first `bs` resampled samples with targets from get_yn().
	When the global `prt` is truthy the test predictions and test labels are
	saved as 'preds_cv<cv>.mat' and 'truth_cv<cv>.mat'.

	Returns
	-------
	The value of `accuracy_op` on the test feed: the mean over samples of the
	squared difference between the one-hot targets and the predictions
	(printed as "loss1"), so lower is better.
	"""
	global X, Y, Gnp, Dnp, train, test, prt, cv
	# Graph-building statements are kept in their original order on purpose.
	D = tf.cast(Dnp, tf.float32)
	G = tf.cast(Gnp, tf.float32)
	num_trees = int(num_trees)
	max_nodes = int(max_nodes)
	prc_cut = int(np.ceil(prc_cut))
	print("gamma=%.4f, delta=%.4f, tau=%.4f, prc_cut=%i, bs_prc=%.4f, num_trees=%i, max_nodes=%i" % (gamma, delta, tau, prc_cut, bs_prc, num_trees, max_nodes))
	input_feats = X.shape[1]
	num_labls = G.shape.as_list()
	output_feats = num_labls[1]
	num_labls = num_labls[0]
	rowsums = np.sum(Gnp,axis=1)
	# NOTE(review): the original author flagged this call with "Bug??";
	# it is left unchanged here.
	train2 = resample(prc_cut, Y, Gnp, train, gamma)
	bs = int(np.ceil(bs_prc*train2.size))
	xs = tf.placeholder(tf.float32, [None,input_feats])
	yin = tf.placeholder(tf.int32, [None])
	print("Vars loaded xs and ys created")
	hparams = tensor_forest.ForestHParams(num_classes=output_feats,
									num_features=input_feats,
									num_trees=num_trees,
									max_nodes=max_nodes).fill()
	print("Tensor forest hparams created")
	forest_graph = tensor_forest.RandomForestGraphs(hparams)
	print("Tensor forest graph created")
	train_op = forest_graph.training_graph(xs, yin)
	loss_op = forest_graph.training_loss(xs, yin)
	print("Loss and train ops created")
	predict, _, _ = forest_graph.inference_graph(xs)
	print("Tensor forest variables created through predict")
	accuracy_op = tf.reduce_mean(tf.reduce_sum(tf.square(tf.one_hot(yin,output_feats)-predict),reduction_indices=[1]))
	print(tf.reduce_sum(tf.square(tf.one_hot(yin,output_feats)-predict),reduction_indices=[1]))
	print("Lambda specific variables created")
	# G2 keeps only the rows of Gnp that map to at most one output class;
	# samples whose label hits exactly one class form the "unambiguous" sets.
	G2 = np.copy(Gnp)
	G2[rowsums>1,:] = 0
	YI = np.matmul(Y,G2)
	YIrs = np.sum(YI,axis=1)
	trainI = train2[np.in1d(train2,np.where(YIrs==1))]
	print("data type trainI,",trainI.dtype)
	testI = test[np.in1d(test,np.where(YIrs==1))]
	print("trainI testI created")
	init_vars = tf.group(tf.global_variables_initializer(),
	resources.initialize_resources(resources.shared_resources()))
	sess = tf.Session()
	try:
		sess.run(init_vars)
		print("Session started")
		tensor_trainI = {xs: X[trainI, :], yin: sess.run(tf.argmax(get_yi(rowsums,G2,Y[trainI, :]),axis=1))}
		print("tensor_trainI made")
		tensor_testI = {xs: X[testI, :], yin: sess.run(tf.argmax(get_yi(rowsums,G2,Y[testI, :]),axis=1))}
		print("tensor_testI made")
		tensor_train = {xs: X[train2[0:bs], :], yin: sess.run(tf.argmax(get_yn(sess.run(predict,feed_dict={xs:X[train2[0:bs],:]}),Y[train2[0:bs], :],delta,tau,output_feats),axis=1))}
		print("tensor_train made")
		tensor_test = {xs: X[test, :], yin: sess.run(tf.argmax(get_yn(sess.run(predict,feed_dict={xs:X[test,:]}),Y[test, :],delta,tau,output_feats),axis=1))}
		print("tensor_test made")
		print("Beginning iterations")
		print(X.shape)
		for i in range(1,101):
			if i < 50:
				# Warm-up phase: unambiguous samples only.
				sess.run(train_op, feed_dict=tensor_trainI)
				if i % 10 == 0:
					print(str(sess.run(accuracy_op, feed_dict=tensor_trainI)) + ' ' + str(sess.run(accuracy_op, feed_dict=tensor_testI)))
			else:
				sess.run(train_op, feed_dict=tensor_train)
				if i % 10 == 0:
					print(str(sess.run(accuracy_op, feed_dict=tensor_train)) + ' ' + str(sess.run(accuracy_op, feed_dict=tensor_test)))
				# NOTE(review): the original had a second `elif i % 10 == 0`
				# branch here meant to reshuffle train2 and rebuild
				# tensor_train. It could never execute (same condition as the
				# `if` above), called the non-existent np.random_shuffle, and
				# fed one-hot rows into the int32 [None] placeholder. It was
				# removed so results stay identical; the batch is therefore
				# fixed for the whole run. Re-enable deliberately (with
				# np.random.shuffle and tf.argmax) if reshuffling is wanted.
		if prt:
			blah = sess.run(predict, feed_dict=tensor_test)
			sio.savemat('preds_cv' + str(cv) + '.mat', {'preds': blah})
			sio.savemat('truth_cv' + str(cv) + '.mat', {'labels': Y[test, :]})
		acc = sess.run(accuracy_op, feed_dict=tensor_test)
		print("loss1=%.4f, gamma=%.4f, delta=%.4f, tau=%.4f, prc_cut=%i, bs_prc=%.4f, num_trees=%i, max_nodes=%i" % (acc, gamma, delta, tau, prc_cut, bs_prc, num_trees, max_nodes))
	finally:
		# Release the session before resetting the graph; resetting the
		# default graph while a session is still open is undefined behaviour,
		# and this function is called once per hyperparameter trial.
		sess.close()
		tf.reset_default_graph()
	return(acc)


================================================
FILE: Snakemake/Scripts/run_LDA.py
================================================
import os
from sys import argv
from pathlib import Path

import numpy as np
import pandas as pd
import time as tm
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import rpy2.robjects as robjects


def run_LDA(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
    '''
    Baseline classifier: linear discriminant analysis (LDA).

    Fits scikit-learn's LinearDiscriminantAnalysis on every cross-validation
    fold stored in the RData file and writes the true labels, the predicted
    labels and the per-fold training / prediction times as csv files.

    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique
    barcodes as row names and gene names as column names.
    LabelsPath : Cell population annotations file path (.csv).
    CV_RDataPath : Cross validation RData file path (.RData), obtained from
    Cross_Validation.R; must provide n_folds, Cells_to_Keep, col_Index,
    Test_Idx and Train_Idx.
    OutputDir : Directory the four LDA_*.csv result files are written to.
    GeneOrderPath : Gene order file path (.csv) obtained from feature
    selection, one column per fold; only read when NumGenes > 0. Default "".
    NumGenes : Number of top-ranked genes used per fold (integer); the
    default 0 disables feature selection.
    '''

    # Bring the cross-validation objects into the embedded R session.
    robjects.r['load'](CV_RDataPath)

    fold_count = np.squeeze(np.array(robjects.r['n_folds'], dtype = 'int'))
    keep_mask = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
    # R column index is 1-based; pandas wants 0-based positions.
    label_col = np.array(robjects.r['col_Index'], dtype = 'int') - 1
    test_folds = np.array(robjects.r['Test_Idx'])
    train_folds = np.array(robjects.r['Train_Idx'])

    # Load expression matrix and annotations, then drop the excluded cells.
    expr = pd.read_csv(DataPath, index_col = 0, sep = ',')
    annot = pd.read_csv(LabelsPath, header = 0, index_col = None, sep = ',', usecols = label_col)
    annot = annot.iloc[keep_mask]
    expr = expr.iloc[keep_mask]

    select_genes = NumGenes > 0
    if select_genes:
        gene_order = pd.read_csv(GeneOrderPath, header = 0, index_col = None, sep = ',')

    # log(1 + x) normalisation
    expr = np.log1p(expr)

    model = LinearDiscriminantAnalysis()

    fit_seconds, predict_seconds = [], []
    true_labels, predicted_labels = [], []

    for fold in range(fold_count):
        # Fold indices come from R and are 1-based.
        test_rows = np.array(test_folds[fold], dtype = 'int') - 1
        train_rows = np.array(train_folds[fold], dtype = 'int') - 1

        x_train, x_test = expr.iloc[train_rows], expr.iloc[test_rows]
        y_train, y_test = annot.iloc[train_rows], annot.iloc[test_rows]

        if select_genes:
            # Column `fold` of the gene order file holds the column positions to use.
            top_genes = gene_order.iloc[0:NumGenes, fold]
            x_train = x_train.iloc[:, top_genes]
            x_test = x_test.iloc[:, top_genes]

        tick = tm.time()
        model.fit(x_train, y_train)
        fit_seconds.append(tm.time() - tick)

        tick = tm.time()
        fold_pred = model.predict(x_test)
        predict_seconds.append(tm.time() - tick)

        true_labels.extend(y_test.values)
        predicted_labels.extend(fold_pred)

    out_dir = Path(OutputDir)
    outputs = (
        (pd.DataFrame(true_labels), "LDA_true.csv"),
        (pd.DataFrame(predicted_labels), "LDA_pred.csv"),
        (pd.DataFrame(fit_seconds), "LDA_training_time.csv"),
        (pd.DataFrame(predict_seconds), "LDA_test_time.csv"),
    )
    for frame, file_name in outputs:
        frame.to_csv(str(out_dir / Path(file_name)), index = False)

# Command-line entry point:
#   <data csv> <labels csv> <CV RData> <output dir> <gene order csv> <number of genes>
run_LDA(DataPath = argv[1], LabelsPath = argv[2], CV_RDataPath = argv[3],
        OutputDir = argv[4], GeneOrderPath = argv[5], NumGenes = int(argv[6]))


================================================
FILE: Snakemake/Scripts/run_LDA_rejection.py
================================================
import os
from sys import argv
from pathlib import Path

import numpy as np
import pandas as pd
import time as tm
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import rpy2.robjects as robjects


def run_LDA_rejection(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0, Threshold = 0.7):
    '''
    run baseline classifier: LDA with a rejection option
    Wrapper script to run LDA classifier on a benchmark dataset with cross validation,
    outputs lists of true and predicted cell labels as csv files, as well as computation time.
    Cells whose highest posterior probability is below Threshold are labelled 'Unknown'.

    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
    as row names and gene names as column names.
    LabelsPath : Cell population annotations file path (.csv).
    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
    OutputDir : Output directory defining the path of the exported file.
    GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
    defining the genes order for each cross validation fold, default is "".
    NumGenes : Number of genes used in case of feature selection (integer), default is 0.
    Threshold : Threshold used when rejecting the cells, default is 0.7.
    '''

    # read the Rdata file
    robjects.r['load'](CV_RDataPath)

    nfolds = np.array(robjects.r['n_folds'], dtype = 'int')
    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
    col = np.array(robjects.r['col_Index'], dtype = 'int')
    col = col - 1  # R indices are 1-based
    test_ind = np.array(robjects.r['Test_Idx'])
    train_ind = np.array(robjects.r['Train_Idx'])

    # read the data
    data = pd.read_csv(DataPath,index_col=0,sep=',')
    labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col)

    labels = labels.iloc[tokeep]
    data = data.iloc[tokeep]

    # read the feature file
    if (NumGenes > 0):
        features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')

    # normalize data
    data = np.log1p(data)

    Classifier = LinearDiscriminantAnalysis()

    tr_time=[]
    ts_time=[]
    truelab = []
    pred = []

    for i in range(np.squeeze(nfolds)):
        # fold indices come from R and are 1-based
        test_ind_i = np.array(test_ind[i], dtype = 'int') - 1
        train_ind_i = np.array(train_ind[i], dtype = 'int') - 1

        train=data.iloc[train_ind_i]
        test=data.iloc[test_ind_i]
        y_train=labels.iloc[train_ind_i]
        y_test=labels.iloc[test_ind_i]

        if (NumGenes > 0):
            feat_to_use = features.iloc[0:NumGenes,i]
            train = train.iloc[:,feat_to_use]
            test = test.iloc[:,feat_to_use]

        start=tm.time()
        Classifier.fit(train, y_train)
        tr_time.append(tm.time()-start)

        start=tm.time()
        # Cast to object dtype before writing 'Unknown': the prediction array may
        # be a fixed-width unicode array (which would silently truncate
        # 'Unknown' when every class name is shorter than 7 characters) or a
        # numeric array (which would raise on the string assignment).
        predicted = np.asarray(Classifier.predict(test), dtype = object)
        prob = np.max(Classifier.predict_proba(test), axis = 1)
        unlabeled = np.where(prob < Threshold)
        predicted[unlabeled] = 'Unknown'
        ts_time.append(tm.time()-start)

        truelab.extend(y_test.values)
        pred.extend(predicted)

    truelab = pd.DataFrame(truelab)
    pred = pd.DataFrame(pred)

    tr_time = pd.DataFrame(tr_time)
    ts_time = pd.DataFrame(ts_time)

    OutputDir = Path(OutputDir)
    truelab.to_csv(str(OutputDir / Path("LDA_rejection_true.csv")),
                   index = False)
    pred.to_csv(str(OutputDir / Path("LDA_rejection_pred.csv")),
                index = False)
    tr_time.to_csv(str(OutputDir / Path("LDA_rejection_training_time.csv")),
                   index = False)
    ts_time.to_csv(str(OutputDir / Path("LDA_rejection_test_time.csv")),
                   index = False)

# Command-line entry point:
#   <data csv> <labels csv> <CV RData> <output dir> <gene order csv> <number of genes>
# The function defined in this script is run_LDA_rejection; the previous call to
# run_LDA referenced a name that does not exist here and raised NameError.
run_LDA_rejection(argv[1], argv[2], argv[3], argv[4], argv[5], int(argv[6]))


================================================
FILE: Snakemake/Scripts/run_NMC.py
================================================
import os
from sys import argv
from pathlib import Path

import numpy as np
import pandas as pd
import time as tm
from sklearn.neighbors import NearestCentroid
import rpy2.robjects as robjects


def run_NMC(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
    '''
    Baseline classifier: nearest mean (centroid) classifier (NMC).

    Fits scikit-learn's NearestCentroid on every cross-validation fold stored
    in the RData file and writes the true labels, the predicted labels and the
    per-fold training / prediction times as csv files.

    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique
    barcodes as row names and gene names as column names.
    LabelsPath : Cell population annotations file path (.csv).
    CV_RDataPath : Cross validation RData file path (.RData), obtained from
    Cross_Validation.R; must provide n_folds, Cells_to_Keep, col_Index,
    Test_Idx and Train_Idx.
    OutputDir : Directory the four NMC_*.csv result files are written to.
    GeneOrderPath : Gene order file path (.csv) obtained from feature
    selection, one column per fold; only read when NumGenes > 0. Default "".
    NumGenes : Number of top-ranked genes used per fold (integer); the
    default 0 disables feature selection.
    '''

    # Bring the cross-validation objects into the embedded R session.
    robjects.r['load'](CV_RDataPath)

    fold_count = np.squeeze(np.array(robjects.r['n_folds'], dtype = 'int'))
    keep_mask = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
    # R column index is 1-based; pandas wants 0-based positions.
    label_col = np.array(robjects.r['col_Index'], dtype = 'int') - 1
    test_folds = np.array(robjects.r['Test_Idx'])
    train_folds = np.array(robjects.r['Train_Idx'])

    # Load expression matrix and annotations, then drop the excluded cells.
    expr = pd.read_csv(DataPath, index_col = 0, sep = ',')
    annot = pd.read_csv(LabelsPath, header = 0, index_col = None, sep = ',', usecols = label_col)
    annot = annot.iloc[keep_mask]
    expr = expr.iloc[keep_mask]

    select_genes = NumGenes > 0
    if select_genes:
        gene_order = pd.read_csv(GeneOrderPath, header = 0, index_col = None, sep = ',')

    # log(1 + x) normalisation
    expr = np.log1p(expr)

    model = NearestCentroid()

    fit_seconds, predict_seconds = [], []
    true_labels, predicted_labels = [], []

    for fold in range(fold_count):
        # Fold indices come from R and are 1-based.
        test_rows = np.array(test_folds[fold], dtype = 'int') - 1
        train_rows = np.array(train_folds[fold], dtype = 'int') - 1

        x_train, x_test = expr.iloc[train_rows], expr.iloc[test_rows]
        y_train, y_test = annot.iloc[train_rows], annot.iloc[test_rows]

        if select_genes:
            # Column `fold` of the gene order file holds the column positions to use.
            top_genes = gene_order.iloc[0:NumGenes, fold]
            x_train = x_train.iloc[:, top_genes]
            x_test = x_test.iloc[:, top_genes]

        tick = tm.time()
        model.fit(x_train, y_train)
        fit_seconds.append(tm.time() - tick)

        tick = tm.time()
        fold_pred = model.predict(x_test)
        predict_seconds.append(tm.time() - tick)

        true_labels.extend(y_test.values)
        predicted_labels.extend(fold_pred)

    out_dir = Path(OutputDir)
    outputs = (
        (pd.DataFrame(true_labels), "NMC_true.csv"),
        (pd.DataFrame(predicted_labels), "NMC_pred.csv"),
        (pd.DataFrame(fit_seconds), "NMC_training_time.csv"),
        (pd.DataFrame(predict_seconds), "NMC_test_time.csv"),
    )
    for frame, file_name in outputs:
        frame.to_csv(str(out_dir / Path(file_name)), index = False)

# Command-line entry point:
#   <data csv> <labels csv> <CV RData> <output dir> <gene order csv> <number of genes>
run_NMC(DataPath = argv[1], LabelsPath = argv[2], CV_RDataPath = argv[3],
        OutputDir = argv[4], GeneOrderPath = argv[5], NumGenes = int(argv[6]))


================================================
FILE: Snakemake/Scripts/run_RF.py
================================================
import os
from sys import argv
from pathlib import Path

import numpy as np
import pandas as pd
import time as tm
from sklearn.ensemble import RandomForestClassifier
import rpy2.robjects as robjects


def run_RF(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
    '''
    Baseline classifier: random forest (RF) with 50 trees.

    Fits scikit-learn's RandomForestClassifier(n_estimators = 50) on every
    cross-validation fold stored in the RData file and writes the true labels,
    the predicted labels and the per-fold training / prediction times as csv
    files.

    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique
    barcodes as row names and gene names as column names.
    LabelsPath : Cell population annotations file path (.csv).
    CV_RDataPath : Cross validation RData file path (.RData), obtained from
    Cross_Validation.R; must provide n_folds, Cells_to_Keep, col_Index,
    Test_Idx and Train_Idx.
    OutputDir : Directory the four RF_*.csv result files are written to.
    GeneOrderPath : Gene order file path (.csv) obtained from feature
    selection, one column per fold; only read when NumGenes > 0. Default "".
    NumGenes : Number of top-ranked genes used per fold (integer); the
    default 0 disables feature selection.
    '''

    # Bring the cross-validation objects into the embedded R session.
    robjects.r['load'](CV_RDataPath)

    fold_count = np.squeeze(np.array(robjects.r['n_folds'], dtype = 'int'))
    keep_mask = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
    # R column index is 1-based; pandas wants 0-based positions.
    label_col = np.array(robjects.r['col_Index'], dtype = 'int') - 1
    test_folds = np.array(robjects.r['Test_Idx'])
    train_folds = np.array(robjects.r['Train_Idx'])

    # Load expression matrix and annotations, then drop the excluded cells.
    expr = pd.read_csv(DataPath, index_col = 0, sep = ',')
    annot = pd.read_csv(LabelsPath, header = 0, index_col = None, sep = ',', usecols = label_col)
    annot = annot.iloc[keep_mask]
    expr = expr.iloc[keep_mask]

    select_genes = NumGenes > 0
    if select_genes:
        gene_order = pd.read_csv(GeneOrderPath, header = 0, index_col = None, sep = ',')

    # log(1 + x) normalisation
    expr = np.log1p(expr)

    model = RandomForestClassifier(n_estimators = 50)

    fit_seconds, predict_seconds = [], []
    true_labels, predicted_labels = [], []

    for fold in range(fold_count):
        # Fold indices come from R and are 1-based.
        test_rows = np.array(test_folds[fold], dtype = 'int') - 1
        train_rows = np.array(train_folds[fold], dtype = 'int') - 1

        x_train, x_test = expr.iloc[train_rows], expr.iloc[test_rows]
        y_train, y_test = annot.iloc[train_rows], annot.iloc[test_rows]

        if select_genes:
            # Column `fold` of the gene order file holds the column positions to use.
            top_genes = gene_order.iloc[0:NumGenes, fold]
            x_train = x_train.iloc[:, top_genes]
            x_test = x_test.iloc[:, top_genes]

        tick = tm.time()
        model.fit(x_train, y_train)
        fit_seconds.append(tm.time() - tick)

        tick = tm.time()
        fold_pred = model.predict(x_test)
        predict_seconds.append(tm.time() - tick)

        true_labels.extend(y_test.values)
        predicted_labels.extend(fold_pred)

    out_dir = Path(OutputDir)
    outputs = (
        (pd.DataFrame(true_labels), "RF_true.csv"),
        (pd.DataFrame(predicted_labels), "RF_pred.csv"),
        (pd.DataFrame(fit_seconds), "RF_training_time.csv"),
        (pd.DataFrame(predict_seconds), "RF_test_time.csv"),
    )
    for frame, file_name in outputs:
        frame.to_csv(str(out_dir / Path(file_name)), index = False)

# Command-line entry point:
#   <data csv> <labels csv> <CV RData> <output dir> <gene order csv> <number of genes>
run_RF(DataPath = argv[1], LabelsPath = argv[2], CV_RDataPath = argv[3],
       OutputDir = argv[4], GeneOrderPath = argv[5], NumGenes = int(argv[6]))


================================================
FILE: Snakemake/Scripts/run_SCINA.R
================================================
run_SCINA<-function(DataPath,LabelsPath,GeneSigPath,OutputDir){
  "
  run SCINA
  Wrapper script to run SCINA on a benchmark dataset,
  outputs lists of true and predicted cell labels as csv files, as well as computation time.
  Only cells annotated as CD14+ Monocyte, CD19+ B or CD56+ NK are evaluated.

  Parameters
  ----------
  DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
  as row names and gene names as column names.
  LabelsPath : Cell population annotations file path (.csv).
  GeneSigPath : Cell type marker genes file path (.csv)
  OutputDir : Output directory the three SCINA_*.csv files are written to.
  The working directory of the R session is left unchanged.
  "

  Data <- read.csv(DataPath,row.names = 1)
  Labels <- as.vector(as.matrix(read.csv(LabelsPath)))

  # Restrict the evaluation to the three populations handled by this wrapper.
  Kept_Populations <- c('CD14+ Monocyte','CD19+ B','CD56+ NK')
  Keep <- is.element(Labels,Kept_Populations)
  Data <- Data[Keep,]
  Labels <- Labels[Keep]

  # Rename the populations (presumably to match the names used in the
  # signature file -- verify against GeneSigPath).
  Labels[Labels == 'CD14+ Monocyte'] <- 'CD14_Monocyte'
  Labels[Labels == 'CD19+ B'] <- 'CD19_B'
  Labels[Labels == 'CD56+ NK'] <- 'CD56_NK'


  #############################################################################
  #                                 SCINA                                     #
  #############################################################################
  library(SCINA)
  Signature_Genes <- preprocess.signatures(GeneSigPath)

  # SCINA expects genes as rows: transpose, log-transform, quantile-normalize.
  library(preprocessCore)
  Data = t(as.matrix(Data))
  Data=log(Data+1)
  Data[]=normalize.quantiles(Data)  # Data[] keeps the dimnames

  # Only the SCINA call itself is timed.
  start_time <- Sys.time()
  results = SCINA(Data, Signature_Genes)
  end_time <- Sys.time()

  True_Labels_SCINA <- Labels
  Pred_Labels_SCINA <- results$cell_labels
  Total_Time_SCINA <- as.numeric(difftime(end_time,start_time,units = 'secs'))

  # Write into OutputDir with explicit paths instead of setwd(), so calling
  # this function does not change the working directory for later code.
  write.csv(True_Labels_SCINA,file.path(OutputDir,'SCINA_True_Labels.csv'),row.names = FALSE)
  write.csv(Pred_Labels_SCINA,file.path(OutputDir,'SCINA_Pred_Labels.csv'),row.names = FALSE)
  write.csv(Total_Time_SCINA,file.path(OutputDir,'SCINA_Total_Time.csv'),row.names = FALSE)
}


================================================
FILE: Snakemake/Scripts/run_SVM.py
================================================
import os
from sys import argv
from pathlib import Path

import numpy as np
import pandas as pd
import time as tm
from sklearn.svm import LinearSVC
import rpy2.robjects as robjects


def run_SVM(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
    '''
    Baseline classifier: support vector machine (SVM) with a linear kernel.

    Fits scikit-learn's LinearSVC on every cross-validation fold stored in the
    RData file and writes the true labels, the predicted labels and the
    per-fold training / prediction times as csv files.

    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique
    barcodes as row names and gene names as column names.
    LabelsPath : Cell population annotations file path (.csv).
    CV_RDataPath : Cross validation RData file path (.RData), obtained from
    Cross_Validation.R; must provide n_folds, Cells_to_Keep, col_Index,
    Test_Idx and Train_Idx.
    OutputDir : Directory the four SVM_*.csv result files are written to.
    GeneOrderPath : Gene order file path (.csv) obtained from feature
    selection, one column per fold; only read when NumGenes > 0. Default "".
    NumGenes : Number of top-ranked genes used per fold (integer); the
    default 0 disables feature selection.
    '''

    # Bring the cross-validation objects into the embedded R session.
    robjects.r['load'](CV_RDataPath)

    fold_count = np.squeeze(np.array(robjects.r['n_folds'], dtype = 'int'))
    keep_mask = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
    # R column index is 1-based; pandas wants 0-based positions.
    label_col = np.array(robjects.r['col_Index'], dtype = 'int') - 1
    test_folds = np.array(robjects.r['Test_Idx'])
    train_folds = np.array(robjects.r['Train_Idx'])

    # Load expression matrix and annotations, then drop the excluded cells.
    expr = pd.read_csv(DataPath, index_col = 0, sep = ',')
    annot = pd.read_csv(LabelsPath, header = 0, index_col = None, sep = ',', usecols = label_col)
    annot = annot.iloc[keep_mask]
    expr = expr.iloc[keep_mask]

    select_genes = NumGenes > 0
    if select_genes:
        gene_order = pd.read_csv(GeneOrderPath, header = 0, index_col = None, sep = ',')

    # log(1 + x) normalisation
    expr = np.log1p(expr)

    model = LinearSVC()

    fit_seconds, predict_seconds = [], []
    true_labels, predicted_labels = [], []

    for fold in range(fold_count):
        # Fold indices come from R and are 1-based.
        test_rows = np.array(test_folds[fold], dtype = 'int') - 1
        train_rows = np.array(train_folds[fold], dtype = 'int') - 1

        x_train, x_test = expr.iloc[train_rows], expr.iloc[test_rows]
        y_train, y_test = annot.iloc[train_rows], annot.iloc[test_rows]

        if select_genes:
            # Column `fold` of the gene order file holds the column positions to use.
            top_genes = gene_order.iloc[0:NumGenes, fold]
            x_train = x_train.iloc[:, top_genes]
            x_test = x_test.iloc[:, top_genes]

        tick = tm.time()
        model.fit(x_train, y_train)
        fit_seconds.append(tm.time() - tick)

        tick = tm.time()
        fold_pred = model.predict(x_test)
        predict_seconds.append(tm.time() - tick)

        true_labels.extend(y_test.values)
        predicted_labels.extend(fold_pred)

    out_dir = Path(OutputDir)
    outputs = (
        (pd.DataFrame(true_labels), "SVM_true.csv"),
        (pd.DataFrame(predicted_labels), "SVM_pred.csv"),
        (pd.DataFrame(fit_seconds), "SVM_training_time.csv"),
        (pd.DataFrame(predict_seconds), "SVM_test_time.csv"),
    )
    for frame, file_name in outputs:
        frame.to_csv(str(out_dir / Path(file_name)), index = False)

# Command-line entry point:
#   <data csv> <labels csv> <CV RData> <output dir> <gene order csv> <number of genes>
run_SVM(DataPath = argv[1], LabelsPath = argv[2], CV_RDataPath = argv[3],
        OutputDir = argv[4], GeneOrderPath = argv[5], NumGenes = int(argv[6]))


================================================
FILE: Snakemake/Scripts/run_SVM_rejection.py
================================================
import os
from sys import argv
from pathlib import Path

import numpy as np
import pandas as pd
import time as tm
from sklearn.svm import LinearSVC
import rpy2.robjects as robjects
from sklearn.calibration import CalibratedClassifierCV


def run_SVM(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0, Threshold = 0.7):
    '''
    run baseline classifier: SVM with a rejection option
    Wrapper script to run an SVM classifier with a linear kernel on a benchmark dataset with cross validation,
    outputs lists of true and predicted cell labels as csv files, as well as computation time.
    The LinearSVC is wrapped in CalibratedClassifierCV to obtain class probabilities;
    cells whose highest probability is below Threshold are labelled 'Unknown'.

    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
    as row names and gene names as column names.
    LabelsPath : Cell population annotations file path (.csv).
    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
    OutputDir : Output directory defining the path of the exported file.
    GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
    defining the genes order for each cross validation fold, default is "".
    NumGenes : Number of genes used in case of feature selection (integer), default is 0.
    Threshold : Threshold used when rejecting the cells, default is 0.7.

    '''

    # read the Rdata file
    robjects.r['load'](CV_RDataPath)

    nfolds = np.array(robjects.r['n_folds'], dtype = 'int')
    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
    col = np.array(robjects.r['col_Index'], dtype = 'int')
    col = col - 1  # R indices are 1-based
    test_ind = np.array(robjects.r['Test_Idx'])
    train_ind = np.array(robjects.r['Train_Idx'])

    # read the data
    data = pd.read_csv(DataPath,index_col=0,sep=',')
    labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col)

    labels = labels.iloc[tokeep]
    data = data.iloc[tokeep]

    # read the feature file
    if (NumGenes > 0):
        features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')

    # normalize data
    data = np.log1p(data)

    # LinearSVC has no predict_proba; the calibration wrapper provides it.
    Classifier = LinearSVC()
    clf = CalibratedClassifierCV(Classifier)

    tr_time=[]
    ts_time=[]
    truelab = []
    pred = []

    for i in range(np.squeeze(nfolds)):
        # fold indices come from R and are 1-based
        test_ind_i = np.array(test_ind[i], dtype = 'int') - 1
        train_ind_i = np.array(train_ind[i], dtype = 'int') - 1

        train=data.iloc[train_ind_i]
        test=data.iloc[test_ind_i]
        y_train=labels.iloc[train_ind_i]
        y_test=labels.iloc[test_ind_i]

        if (NumGenes > 0):
            feat_to_use = features.iloc[0:NumGenes,i]
            train = train.iloc[:,feat_to_use]
            test = test.iloc[:,feat_to_use]

        start=tm.time()
        clf.fit(train, y_train)
        tr_time.append(tm.time()-start)

        start=tm.time()
        # Cast to object dtype before writing 'Unknown': a fixed-width unicode
        # prediction array would truncate the string and a numeric one would
        # raise on the assignment.
        predicted = np.asarray(clf.predict(test), dtype = object)
        prob = np.max(clf.predict_proba(test), axis = 1)
        unlabeled = np.where(prob < Threshold)
        predicted[unlabeled] = 'Unknown'
        ts_time.append(tm.time()-start)

        truelab.extend(y_test.values)
        pred.extend(predicted)

    truelab = pd.DataFrame(truelab)
    pred = pd.DataFrame(pred)

    tr_time = pd.DataFrame(tr_time)
    ts_time = pd.DataFrame(ts_time)

    OutputDir = Path(OutputDir)
    truelab.to_csv(str(OutputDir / Path("SVM_rejection_true.csv")),
                   index = False)
    pred.to_csv(str(OutputDir / Path("SVM_rejection_pred.csv")),
                index = False)
    tr_time.to_csv(str(OutputDir / Path("SVM_rejection_training_time.csv")),
                   index = False)
    ts_time.to_csv(str(OutputDir / Path("SVM_rejection_test_time.csv")),
                   index = False)

# Command-line entry point (Threshold keeps its default value):
#   <data csv> <labels csv> <CV RData> <output dir> <gene order csv> <number of genes>
run_SVM(DataPath = argv[1], LabelsPath = argv[2], CV_RDataPath = argv[3],
        OutputDir = argv[4], GeneOrderPath = argv[5], NumGenes = int(argv[6]))


================================================
FILE: Snakemake/Scripts/run_SingleR.R
================================================
# Command-line arguments that follow the script name.
args <- commandArgs(trailingOnly = TRUE)

run_SingleR<-function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){
  "
  run SingleR
  Wrapper script to run SingleR on a benchmark dataset with cross validation,
  outputs lists of true and predicted cell labels as csv files, as well as computation time.

  Parameters
  ----------
  DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
  as row names and gene names as column names.
  LabelsPath : Cell population annotations file path (.csv).
  CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
  OutputDir : Output directory defining the path of the exported file.
  GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
  defining the genes order for each cross validation fold, default is NULL.
  NumGenes : Number of genes used in case of feature selection (integer), default is NULL.
  Feature selection is applied only when both GeneOrderPath and NumGenes are given.
  "

  Data <- read.csv(DataPath,row.names = 1)
  Labels <- as.matrix(read.csv(LabelsPath))
  # Brings n_folds, col_Index, Cells_to_Keep, Train_Idx and Test_Idx into scope.
  load(CV_RDataPath)
  Labels <- as.vector(Labels[,col_Index])
  Data <- Data[Cells_to_Keep,]
  Labels <- Labels[Cells_to_Keep]

  # Scalar condition: use the short-circuit && instead of the elementwise &.
  UseGeneOrder <- !is.null(GeneOrderPath) && !is.null(NumGenes)
  if(UseGeneOrder){
    GenesOrder = read.csv(GeneOrderPath)
  }

  #############################################################################
  #                               SingleR                                     #
  #############################################################################
  library(SingleR)
  library(Seurat)
  True_Labels_SingleR <- list()
  Pred_Labels_SingleR <- list()
  Total_Time_SingleR <- list()
  # SingleR takes genes as rows and cells as columns.
  Data = t(as.matrix(Data))

  # seq_len() yields an empty sequence for 0, unlike 1:0.
  for (i in seq_len(n_folds)){
    if(UseGeneOrder){
      start_time <- Sys.time()
      # Values in the gene order file are shifted by one to index rows in R.
      Genes <- as.vector(GenesOrder[seq_len(NumGenes),i])+1
      singler = SingleR(method = "single", Data[Genes,Test_Idx[[i]]],
                        Data[Genes,Train_Idx[[i]]],
                        Labels[Train_Idx[[i]]], numCores = 1)
      end_time <- Sys.time()
    }
    else{
      start_time <- Sys.time()
      singler = SingleR(method = "single", Data[,Test_Idx[[i]]], Data[,Train_Idx[[i]]], Labels[Train_Idx[[i]]], numCores = 1)
      end_time <- Sys.time()
    }
    Total_Time_SingleR[i] <- as.numeric(difftime(end_time,start_time,units = 'secs'))

    True_Labels_SingleR[i] <- list(Labels[Test_Idx[[i]]])
    Pred_Labels_SingleR[i] <- list(as.vector(singler$labels))
  }
  # Flatten the per-fold lists into single vectors before writing.
  True_Labels_SingleR <- as.vector(unlist(True_Labels_SingleR))
  Pred_Labels_SingleR <- as.vector(unlist(Pred_Labels_SingleR))
  Total_Time_SingleR <- as.vector(unlist(Total_Time_SingleR))

  write.csv(True_Labels_SingleR,paste0(OutputDir,'/SingleR_true.csv'),row.names = FALSE)
  write.csv(Pred_Labels_SingleR,paste0(OutputDir,'/SingleR_pred.csv'),row.names = FALSE)
  write.csv(Total_Time_SingleR,paste0(OutputDir,'/SingleR_total_time.csv'),row.names = FALSE)
}

# Sixth argument is the number of genes; "0" (or a missing argument) means no
# feature selection. The is.na() guard avoids the
# "missing value where TRUE/FALSE needed" error when fewer than six arguments
# are supplied.
if (is.na(args[6]) || args[6] == "0") {
  run_SingleR(args[1], args[2], args[3], args[4])
} else {
  run_SingleR(args[1], args[2], args[3], args[4], args[5], as.numeric(args[6]))
}


================================================
FILE: Snakemake/Scripts/run_kNN50.py
================================================
import os
from sys import argv
from pathlib import Path

import numpy as np
import pandas as pd
import time as tm
from sklearn.neighbors import KNeighborsClassifier
import rpy2.robjects as robjects


def run_kNN50(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
    '''
    Baseline classifier: k-nearest neighbours with k = 50.

    Runs the classifier over the cross-validation folds stored in an RData file
    and writes the true labels, the predicted labels, and the per-fold training
    and prediction times as csv files.

    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
    as row names and gene names as column names.
    LabelsPath : Cell population annotations file path (.csv).
    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
    OutputDir : Directory in which the four result files are written.
    GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
    one column per cross validation fold; only read when NumGenes > 0, default is "".
    NumGenes : Number of genes used in case of feature selection (integer),
    default is 0 (use all genes).
    '''

    # Pull the cross-validation objects out of the embedded R session.
    robjects.r['load'](CV_RDataPath)

    fold_count = np.array(robjects.r['n_folds'], dtype = 'int')
    keep_mask = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
    # The label column index is shifted by one to become a 0-based position.
    label_col = np.array(robjects.r['col_Index'], dtype = 'int') - 1
    fold_test = np.array(robjects.r['Test_Idx'])
    fold_train = np.array(robjects.r['Train_Idx'])

    # Expression matrix and annotations, restricted to the retained cells.
    expression = pd.read_csv(DataPath, index_col=0, sep=',')
    annotations = pd.read_csv(LabelsPath, header=0, index_col=None, sep=',', usecols = label_col)

    annotations = annotations.iloc[keep_mask]
    expression = expression.iloc[keep_mask]

    # Per-fold gene ranking, only needed when feature selection is requested.
    select_genes = NumGenes > 0
    if select_genes:
        gene_ranking = pd.read_csv(GeneOrderPath, header=0, index_col=None, sep=',')

    # log(1 + x) transform of the whole matrix.
    expression = np.log1p(expression)

    knn = KNeighborsClassifier(n_neighbors=50)

    fit_seconds, predict_seconds = [], []
    observed, predicted_all = [], []

    for fold in range(np.squeeze(fold_count)):
        # Fold indices are shifted by one to become 0-based row positions.
        held_out = np.array(fold_test[fold], dtype = 'int') - 1
        fitted = np.array(fold_train[fold], dtype = 'int') - 1

        x_train, x_test = expression.iloc[fitted], expression.iloc[held_out]
        y_train, y_test = annotations.iloc[fitted], annotations.iloc[held_out]

        if select_genes:
            # Top NumGenes entries of this fold's column are used as column positions.
            chosen = gene_ranking.iloc[0:NumGenes, fold]
            x_train = x_train.iloc[:, chosen]
            x_test = x_test.iloc[:, chosen]

        tick = tm.time()
        knn.fit(x_train, y_train)
        fit_seconds.append(tm.time() - tick)

        tick = tm.time()
        fold_prediction = knn.predict(x_test)
        predict_seconds.append(tm.time() - tick)

        observed.extend(y_test.values)
        predicted_all.extend(fold_prediction)

    # One csv per result, written in a fixed order.
    target_dir = Path(OutputDir)
    outputs = (
        ("kNN50_true.csv", pd.DataFrame(observed)),
        ("kNN50_pred.csv", pd.DataFrame(predicted_all)),
        ("kNN50_training_time.csv", pd.DataFrame(fit_seconds)),
        ("kNN50_test_time.csv", pd.DataFrame(predict_seconds)),
    )
    for file_name, frame in outputs:
        frame.to_csv(str(target_dir / Path(file_name)), index = False)

# Script entry point: the six positional command-line arguments are forwarded in
# order (data, labels, CV RData, output dir, gene order file, number of genes);
# the last one is converted to int, so it must always be supplied.
run_kNN50(argv[1], argv[2], argv[3], argv[4], argv[5], int(argv[6]))


================================================
FILE: Snakemake/Scripts/run_kNN9.py
================================================
import os
from sys import argv
from pathlib import Path

import numpy as np
import pandas as pd
import time as tm
from sklearn.neighbors import KNeighborsClassifier
import rpy2.robjects as robjects


def run_kNN9(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
    '''
    Baseline classifier: k-nearest neighbours with k = 9.

    Runs the classifier over the cross-validation folds stored in an RData file
    and writes the true labels, the predicted labels, and the per-fold training
    and prediction times as csv files.

    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
    as row names and gene names as column names.
    LabelsPath : Cell population annotations file path (.csv).
    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
    OutputDir : Directory in which the four result files are written.
    GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
    one column per cross validation fold; only read when NumGenes > 0, default is "".
    NumGenes : Number of genes used in case of feature selection (integer),
    default is 0 (use all genes).
    '''

    # Pull the cross-validation objects out of the embedded R session.
    robjects.r['load'](CV_RDataPath)

    fold_count = np.array(robjects.r['n_folds'], dtype = 'int')
    keep_mask = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
    # The label column index is shifted by one to become a 0-based position.
    label_col = np.array(robjects.r['col_Index'], dtype = 'int') - 1
    fold_test = np.array(robjects.r['Test_Idx'])
    fold_train = np.array(robjects.r['Train_Idx'])

    # Expression matrix and annotations, restricted to the retained cells.
    expression = pd.read_csv(DataPath, index_col=0, sep=',')
    annotations = pd.read_csv(LabelsPath, header=0, index_col=None, sep=',', usecols = label_col)

    annotations = annotations.iloc[keep_mask]
    expression = expression.iloc[keep_mask]

    # Per-fold gene ranking, only needed when feature selection is requested.
    select_genes = NumGenes > 0
    if select_genes:
        gene_ranking = pd.read_csv(GeneOrderPath, header=0, index_col=None, sep=',')

    # log(1 + x) transform of the whole matrix.
    expression = np.log1p(expression)

    knn = KNeighborsClassifier(n_neighbors=9)

    fit_seconds, predict_seconds = [], []
    observed, predicted_all = [], []

    for fold in range(np.squeeze(fold_count)):
        # Fold indices are shifted by one to become 0-based row positions.
        held_out = np.array(fold_test[fold], dtype = 'int') - 1
        fitted = np.array(fold_train[fold], dtype = 'int') - 1

        x_train, x_test = expression.iloc[fitted], expression.iloc[held_out]
        y_train, y_test = annotations.iloc[fitted], annotations.iloc[held_out]

        if select_genes:
            # Top NumGenes entries of this fold's column are used as column positions.
            chosen = gene_ranking.iloc[0:NumGenes, fold]
            x_train = x_train.iloc[:, chosen]
            x_test = x_test.iloc[:, chosen]

        tick = tm.time()
        knn.fit(x_train, y_train)
        fit_seconds.append(tm.time() - tick)

        tick = tm.time()
        fold_prediction = knn.predict(x_test)
        predict_seconds.append(tm.time() - tick)

        observed.extend(y_test.values)
        predicted_all.extend(fold_prediction)

    # One csv per result, written in a fixed order.
    target_dir = Path(OutputDir)
    outputs = (
        ("kNN9_true.csv", pd.DataFrame(observed)),
        ("kNN9_pred.csv", pd.DataFrame(predicted_all)),
        ("kNN9_training_time.csv", pd.DataFrame(fit_seconds)),
        ("kNN9_test_time.csv", pd.DataFrame(predict_seconds)),
    )
    for file_name, frame in outputs:
        frame.to_csv(str(target_dir / Path(file_name)), index = False)

# Script entry point: the six positional command-line arguments are forwarded in
# order (data, labels, CV RData, output dir, gene order file, number of genes);
# the last one is converted to int, so it must always be supplied.
# This file defines run_kNN9 only; calling run_kNN50 here raised NameError.
run_kNN9(argv[1], argv[2], argv[3], argv[4], argv[5], int(argv[6]))


================================================
FILE: Snakemake/Scripts/run_moana.py
================================================
import os
import pandas as pd
import numpy as np
from moana.core import ExpMatrix
from moana.classify import CellTypeClassifier
import time as tm
import rpy2.robjects as robjects

def run_moana(DataPath, LabelsPath, ClassifierPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
    '''
    run moana
    Wrapper script to run moana on a benchmark dataset with a pretrained classifier,
    outputs lists of true and predicted cell labels as csv files, as well as computation time.

    No cross validation is performed here: the RData-based cell filtering is
    commented out below, and the whole (label-filtered) dataset is predicted once.

    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
    as row names and gene names as column names.
    LabelsPath : Cell population annotations file path (.csv).
    ClassifierPath : Data file path to the pretrained classifier
    (loaded with CellTypeClassifier.read_pickle).
    OutputDir : Output directory defining the path of the exported file.
    The process working directory is changed to this directory before writing.
    GeneOrderPath : Gene order file path (.csv) obtained from feature selection;
    only its first column is used, and only when NumGenes > 0. Default is "".
    NumGenes : Number of genes used in case of feature selection (integer), default is 0.
    '''

#    # read the Rdata file
#    robjects.r['load'](CV_RDataPath)
#
#    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
#    col = np.array(robjects.r['col_Index'], dtype = 'int')
#    col = col - 1

    # The input is comma separated, hence the explicit sep override on read_tsv.
    matrix = ExpMatrix.read_tsv(DataPath, sep = ',')
#    matrix = matrix.iloc[tokeep]

    truelab = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',')
#    truelab = truelab.iloc[tokeep]

    # Dataset label names (ct_old) and their replacements (ct_new); the two lists
    # are paired by position in the renaming loop below.
    ct_old = ['CD19+ B','CD14+ Monocyte','CD4+/CD45RA+/CD25- Naive T','CD4+/CD45RO+ Memory','CD8+/CD45RA+ Naive Cytotoxic','Dendritic', 'CD56+ NK']
    ct_new = ['B cells','CD14+ monocytes','Naive CD4+ T cells','Memory CD4+ T cells','Naive CD8+ T cells','Dendritic cells','NK cells']

    # Keep only the cells whose annotation is one of the ct_old names.
    # NOTE(review): truelab is a DataFrame, so np.isin returns a mask with the
    # same 2-D shape; confirm that truelab[tokeep2] drops rows (rather than
    # masking values) with the installed pandas version, so that it stays
    # aligned with the squeezed mask applied to matrix.
    tokeep2 = np.isin(truelab,ct_old)
    truelab = truelab[tokeep2]
    print(len(truelab))
    matrix = matrix.iloc[np.squeeze(tokeep2)]

    # Rename the remaining annotations from ct_old[i] to ct_new[i].
    for i in range(len(ct_old)):
        truelab.iloc[truelab == ct_old[i]] = ct_new[i]

    # read the feature file
    # Only column 0 of the gene order file is used (there are no folds here); its
    # first NumGenes entries are used as column positions.
    if (NumGenes > 0):
        features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')
        feat_to_use = features.iloc[0:NumGenes,0]
        matrix = matrix.iloc[:,feat_to_use]

    # Build a new ExpMatrix from the transposed values, swapping the two axis
    # labels (matrix.cells become the genes, matrix.genes become the cells).
    # Presumably this converts the cells-by-genes file layout into the
    # genes-by-cells layout the classifier expects -- TODO confirm.
    data = ExpMatrix(X = np.transpose(matrix.X), genes = matrix.cells, cells = matrix.genes)
    data.genes.name = 'Genes'
    data.cells.name = 'Cells'
    data.index.name = 'Genes'
    data.columns.name = 'Cells'

    clf = CellTypeClassifier.read_pickle(ClassifierPath)

    # Only the predict call is timed; loading the classifier is not included.
    start = tm.time()
    predictions = clf.predict(data)
    runtime = tm.time() - start

    # NOTE(review): the result of this call is discarded, so the statement does
    # not change predictions.
    np.asarray(predictions)

    pred = pd.DataFrame(predictions)

    # Changes the working directory of the whole process; the relative file names
    # below are resolved against OutputDir.
    os.chdir(OutputDir)

    # Output names carry the gene count only when feature selection was used.
    if (NumGenes == 0):
        truelab.to_csv("moana_True_Labels.csv", index = False)
        pred.to_csv("moana_Pred_Labels.csv", index = False)
        with open("moana_Total_Time.csv", 'w') as f:
            f.write("%f\n" % runtime)
    else:
        truelab.to_csv("moana_" + str(NumGenes) + "_True_Labels.csv", index = False)
        pred.to_csv("moana_" + str(NumGenes) + "_Pred_Labels.csv", index = False)
        with open("moana_" + str(NumGenes) + "_Total_Time.csv", 'w') as f:
            f.write("%f\n" % runtime)


================================================
FILE: Snakemake/Scripts/run_scID.R
================================================
# Trailing command-line arguments only (TRUE = trailingOnly); they are consumed
# by the dispatch at the bottom of this script.
args <- commandArgs(TRUE)

run_scID<-function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){
  "
  run scID
  Wrapper script to run scID on a benchmark dataset with cross validation,
  outputs lists of true and predicted cell labels as csv files, as well as computation time.

  Parameters
  ----------
  DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
  as row names and gene names as column names.
  LabelsPath : Cell population annotations file path (.csv).
  CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
  It must provide col_Index, Cells_to_Keep, n_folds, Train_Idx and Test_Idx.
  OutputDir : Output directory defining the path of the exported file.
  GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
  defining the genes order for each cross validation fold, default is NULL.
  NumGenes : Number of genes used in case of feature selection (integer), default is NULL.
  Feature selection is applied only when both GeneOrderPath and NumGenes are given.

  Writes scID_pred.csv, scID_true.csv and scID_total_time.csv into OutputDir.
  "

  Data <- read.csv(DataPath, row.names = 1)
  Labels <- as.matrix(read.csv(LabelsPath))
  # load() defines the cross validation objects inside this function's environment
  load(CV_RDataPath)
  Labels <- as.vector(Labels[, col_Index])
  Data <- Data[Cells_to_Keep, ]
  Labels <- Labels[Cells_to_Keep]

  # Scalar condition, so the short-circuit operator is the right one
  use_feature_selection <- !is.null(GeneOrderPath) && !is.null(NumGenes)
  if (use_feature_selection) {
    GenesOrder <- read.csv(GeneOrderPath)
  }

  #############################################################################
  #                                 scID                                      #
  #############################################################################
  library(scID)
  library(Seurat)
  True_Labels_scID <- vector("list", n_folds)
  Pred_Labels_scID <- vector("list", n_folds)
  Total_Time_scID <- vector("list", n_folds)
  # genes in rows, cells in columns from here on
  Data <- t(as.matrix(Data))

  for (i in seq_len(n_folds)) {
    train_cells <- Train_Idx[[i]]
    test_cells <- Test_Idx[[i]]

    # Rows (genes) used in this fold: the top NumGenes entries of column i of
    # the gene order file, shifted by one, or every gene.
    gene_rows <- if (use_feature_selection) {
      as.vector(GenesOrder[seq_len(NumGenes), i]) + 1
    } else {
      seq_len(nrow(Data))
    }

    # Training labels named by cell so they can be matched to the reference columns
    Train_Labels <- Labels[train_cells]
    names(Train_Labels) <- colnames(Data)[train_cells]

    # Subsetting stays inside the timed region, as before; drop = FALSE keeps
    # a matrix even when a single gene or cell is selected.
    start_time <- Sys.time()
    scID_output <- scid_multiclass(Data[gene_rows, test_cells, drop = FALSE],
                                   Data[gene_rows, train_cells, drop = FALSE],
                                   Train_Labels)
    end_time <- Sys.time()

    Total_Time_scID[[i]] <- as.numeric(difftime(end_time, start_time, units = 'secs'))
    True_Labels_scID[[i]] <- Labels[test_cells]
    Pred_Labels_scID[[i]] <- as.vector(scID_output$labels)
  }

  # Concatenate the per-fold results in fold order
  True_Labels_scID <- as.vector(unlist(True_Labels_scID))
  Pred_Labels_scID <- as.vector(unlist(Pred_Labels_scID))
  Total_Time_scID <- as.vector(unlist(Total_Time_scID))

  write.csv(Pred_Labels_scID, file.path(OutputDir, 'scID_pred.csv'), row.names = FALSE)
  write.csv(True_Labels_scID, file.path(OutputDir, 'scID_true.csv'), row.names = FALSE)
  write.csv(Total_Time_scID, file.path(OutputDir, 'scID_total_time.csv'), row.names = FALSE)

}

# Command-line entry point. A literal "0" as the sixth argument means that no
# feature selection is requested, so only the four required paths are passed on
# and GeneOrderPath / NumGenes keep their NULL defaults. Otherwise the fifth
# argument (gene order file) and the sixth (number of genes, converted to
# numeric) are forwarded as well.
if (args[6] == "0") {
  run_scID(args[1], args[2], args[3], args[4])
} else {
  run_scID(args[1], args[2], args[3], args[4], args[5], as.numeric(args[6]))
}


================================================
FILE: Snakemake/Scripts/run_scPred.R
================================================
run_scPred<-function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){
  "
  run scPred
  Wrapper script to run scPred on a benchmark dataset with cross validation,
  outputs lists of true and predicted cell labels as csv files, as well as computation time.

  Parameters
  ----------
  DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
  as row names and gene names as column names.
  LabelsPath : Cell population annotations file path (.csv).
  CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
  It must provide col_Index, Cells_to_Keep, n_folds, Train_Idx and Test_Idx.
  OutputDir : Output directory defining the path of the exported file.
  GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
  defining the genes order for each cross validation fold, default is NULL.
  NumGenes : Number of genes used in case of feature selection (integer), default is NULL.
  Feature selection is applied only when both GeneOrderPath and NumGenes are given.

  Output files are written into OutputDir through explicit paths; the working
  directory of the R session is left untouched.
  "

  Data <- read.csv(DataPath, row.names = 1)
  Labels <- as.matrix(read.csv(LabelsPath))
  # load() defines the cross validation objects inside this function's environment
  load(CV_RDataPath)
  Labels <- as.vector(Labels[, col_Index])
  Data <- Data[Cells_to_Keep, ]
  Labels <- Labels[Cells_to_Keep]

  # Scalar condition, so the short-circuit operator is the right one
  use_feature_selection <- !is.null(GeneOrderPath) && !is.null(NumGenes)
  if (use_feature_selection) {
    GenesOrder <- read.csv(GeneOrderPath)
  }

  #############################################################################
  #                                scPred                                     #
  #############################################################################
  library(scPred)
  library(tidyverse)
  library(SingleCellExperiment)
  True_Labels_scPred <- vector("list", n_folds)
  Pred_Labels_scPred <- vector("list", n_folds)
  Training_Time_scPred <- vector("list", n_folds)
  Testing_Time_scPred <- vector("list", n_folds)
  # genes in rows, cells in columns from here on
  Data <- t(as.matrix(Data))

  # Wrap the selected genes/cells in a SingleCellExperiment and return the
  # counts-per-million matrix together with the cell metadata.
  prepare_cells <- function(gene_rows, cells) {
    sce <- SingleCellExperiment(list(normcounts = Data[gene_rows, cells, drop = FALSE]),
                                colData = data.frame(cell_type1 = Labels[cells]))
    list(cpm = apply(normcounts(sce), 2, function(x) (x / sum(x)) * 1000000),
         metadata = as.data.frame(colData(sce)))
  }

  for (i in seq_len(n_folds)) {
    # Rows (genes) used in this fold: the top NumGenes entries of column i of
    # the gene order file, shifted by one, or every gene.
    gene_rows <- if (use_feature_selection) {
      as.vector(GenesOrder[seq_len(NumGenes), i]) + 1
    } else {
      seq_len(nrow(Data))
    }

    # Data preparation is deliberately kept outside the timed regions
    train_set <- prepare_cells(gene_rows, Train_Idx[[i]])
    test_set <- prepare_cells(gene_rows, Test_Idx[[i]])

    # scPred Training
    start_time <- Sys.time()
    set.seed(1234)
    scp <- eigenDecompose(train_set$cpm)
    scPred::metadata(scp) <- train_set$metadata
    scp <- getFeatureSpace(scp, pVar = 'cell_type1')
    scp <- trainModel(scp)
    end_time <- Sys.time()
    Training_Time_scPred[[i]] <- as.numeric(difftime(end_time, start_time, units = 'secs'))

    # scPred Prediction
    start_time <- Sys.time()
    scp <- scPredict(scp, newData = test_set$cpm)
    end_time <- Sys.time()
    Testing_Time_scPred[[i]] <- as.numeric(difftime(end_time, start_time, units = 'secs'))

    True_Labels_scPred[[i]] <- Labels[Test_Idx[[i]]]
    Pred_Labels_scPred[[i]] <- getPredictions(scp)$predClass
  }

  # Concatenate the per-fold results in fold order
  True_Labels_scPred <- as.vector(unlist(True_Labels_scPred))
  Pred_Labels_scPred <- as.vector(unlist(Pred_Labels_scPred))
  Training_Time_scPred <- as.vector(unlist(Training_Time_scPred))
  Testing_Time_scPred <- as.vector(unlist(Testing_Time_scPred))

  # File names carry the gene count only when feature selection was used,
  # e.g. scPred_100_True_Labels.csv versus scPred_True_Labels.csv.
  file_prefix <- if (use_feature_selection) paste0('scPred_', NumGenes, '_') else 'scPred_'
  output_file <- function(suffix) file.path(OutputDir, paste0(file_prefix, suffix))

  write.csv(True_Labels_scPred, output_file('True_Labels.csv'), row.names = FALSE)
  write.csv(Pred_Labels_scPred, output_file('Pred_Labels.csv'), row.names = FALSE)
  write.csv(Training_Time_scPred, output_file('Training_Time.csv'), row.names = FALSE)
  write.csv(Testing_Time_scPred, output_file('Testing_Time.csv'), row.names = FALSE)
}


================================================
FILE: Snakemake/Scripts/run_scVI.py
================================================
from scvi.dataset import CsvDataset
import os
from sys import argv
from pathlib import Path
from scvi.dataset import CsvDataset
import numpy as np
import pandas as pd
from scvi.models import SCANVI
from scvi.inference import SemiSupervisedTrainer
import time as tm
import rpy2.robjects as robjects

def run_scVI(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
    '''
    run scVI
    Wrapper script to run scVI (SCANVI with a semi-supervised trainer) on a benchmark
    dataset with cross validation, outputs lists of true and predicted cell labels
    as csv files, as well as computation time.

    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
    as row names and gene names as column names.
    LabelsPath : Cell population annotations file path (.csv).
    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
    OutputDir : Output directory defining the path of the exported file. The
    intermediate Labels_scvi.csv / Data_scvi.csv files are written here as well.
    GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
    defining the genes order for each cross validation fold; only read when
    NumGenes > 0, default is "".
    NumGenes : Number of genes used in case of feature selection (integer), default is 0.

    Notes
    -----
    A fresh SCANVI model and trainer are created for every fold, with or without
    feature selection, so that nothing learned in one fold (whose training cells
    include other folds' test cells) is carried into the next one.
    The predicted and true labels are written as returned by compute_predictions
    (numeric codes, not mapped back to label names).
    '''

    # read the Rdata file
    robjects.r['load'](CV_RDataPath)

    nfolds = np.array(robjects.r['n_folds'], dtype = 'int')
    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
    # the label column index is shifted by one to become a 0-based position
    col = np.array(robjects.r['col_Index'], dtype = 'int') - 1
    test_ind = np.array(robjects.r['Test_Idx'])
    train_ind = np.array(robjects.r['Train_Idx'])

    # read the data
    data = pd.read_csv(DataPath,index_col=0,sep=',')
    labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col)

    labels = labels.iloc[tokeep]
    data = data.iloc[tokeep]

    # Intermediate files consumed by CsvDataset; kept in OutputDir in both modes
    # so that runs started from the same working directory cannot overwrite
    # each other's files.
    labels_csv = OutputDir + '/Labels_scvi.csv'
    data_csv = OutputDir + '/Data_scvi.csv'

    # save labels as csv file with header and index column (identical for every fold)
    labels.to_csv(labels_csv)

    if (NumGenes > 0):
        # read the feature file
        features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')
    else:
        # without feature selection the dataset is the same for all folds
        data.to_csv(data_csv)
        dataset = CsvDataset(data_csv, save_path = "", sep = ",", labels_file = labels_csv, gene_by_cell = False)

    n_epochs = 200

    truelab = []
    pred = []
    tr_time = []
    ts_time = []

    for i in range(np.squeeze(nfolds)):
        # fold indices are shifted by one to become 0-based positions
        test_ind_i = np.array(test_ind[i], dtype = 'int') - 1
        train_ind_i = np.array(train_ind[i], dtype = 'int') - 1

        if (NumGenes > 0):
            # top NumGenes entries of this fold's column, used as column positions
            feat_to_use = features.iloc[0:NumGenes,i]
            data.iloc[:,feat_to_use].to_csv(data_csv)
            dataset = CsvDataset(data_csv, save_path = "", sep = ",", labels_file = labels_csv, gene_by_cell = False, new_n_genes = False)

        # New model and trainer per fold: reusing them would continue training
        # from weights fitted on the previous folds.
        scanvi = SCANVI(dataset.nb_genes, dataset.n_batches, dataset.n_labels)
        trainer_scanvi = SemiSupervisedTrainer(scanvi, dataset, frequency=5)

        # the trainer picks its own split by default; override it with the fold's
        # training cells (labelled) and test cells (unlabelled)
        trainer_scanvi.labelled_set = trainer_scanvi.create_posterior(indices=(train_ind_i).ravel(), shuffle = False)
        trainer_scanvi.labelled_set.to_monitor = ['ll','accuracy']
        trainer_scanvi.unlabelled_set = trainer_scanvi.create_posterior(indices=(test_ind_i).ravel(), shuffle = False)
        trainer_scanvi.unlabelled_set.to_monitor = ['ll','accuracy']

        start = tm.time()
        trainer_scanvi.train(n_epochs)
        tr_time.append(tm.time()-start)

        ## labels of test set are in y_pred
        ## labels are returned in numbers, should be mapped back to the real labels
        start = tm.time()
        y_true, y_pred = trainer_scanvi.unlabelled_set.compute_predictions()
        ts_time.append(tm.time()-start)

        truelab.extend(y_true)
        pred.extend(y_pred)

    # write results
    truelab = pd.DataFrame(truelab)
    pred = pd.DataFrame(pred)

    tr_time = pd.DataFrame(tr_time)
    ts_time = pd.DataFrame(ts_time)

    truelab.to_csv(str(Path(OutputDir + "/scVI_true.csv")), index=False)
    pred.to_csv(str(Path(OutputDir + "/scVI_pred.csv")), index=False)
    tr_time.to_csv(str(Path(OutputDir + "/scVI_training_time.csv")), index=False)
    ts_time.to_csv(str(Path(OutputDir + "/scVI_test_time.csv")), index=False)

# Script entry point: the six positional command-line arguments are forwarded in
# order (data, labels, CV RData, output dir, gene order file, number of genes);
# the last one is converted to int, so it must always be supplied.
run_scVI(argv[1], argv[2], argv[3], argv[4], argv[5], int(argv[6]))


================================================
FILE: Snakemake/Scripts/run_scmap.R
================================================
run_scmap <- function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){
  "
  run scmap
  Wrapper script to run scmap (both scmap-cluster and scmap-cell) on a benchmark
  dataset with cross validation, outputs lists of true and predicted cell labels
  as csv files, as well as computation time.

  Parameters
  ----------
  DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
  as row names and gene names as column names.
  LabelsPath : Cell population annotations file path (.csv).
  CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
  It must provide col_Index, Cells_to_Keep, n_folds, Train_Idx and Test_Idx.
  OutputDir : Output directory defining the path of the exported file.
  GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
  defining the genes order for each cross validation fold, default is NULL.
  NumGenes : Number of genes used in case of feature selection (integer), default is NULL.
  Feature selection is applied only when both GeneOrderPath and NumGenes are given.

  Output files are written into OutputDir through explicit paths; the working
  directory of the R session is left untouched.
  "

  Data <- read.csv(DataPath, row.names = 1)
  Labels <- as.matrix(read.csv(LabelsPath))
  # load() defines the cross validation objects inside this function's environment
  load(CV_RDataPath)
  Labels <- as.vector(Labels[, col_Index])
  Data <- Data[Cells_to_Keep, ]
  Labels <- Labels[Cells_to_Keep]

  # Scalar condition, so the short-circuit operator is the right one
  use_feature_selection <- !is.null(GeneOrderPath) && !is.null(NumGenes)
  if (use_feature_selection) {
    GenesOrder <- read.csv(GeneOrderPath)
  }

  #############################################################################
  #                                 scmap                                     #
  #############################################################################
  library(scmap)
  library(SingleCellExperiment)
  True_Labels_scmapcluster <- vector("list", n_folds)
  Pred_Labels_scmapcluster <- vector("list", n_folds)
  True_Labels_scmapcell <- vector("list", n_folds)
  Pred_Labels_scmapcell <- vector("list", n_folds)
  Training_Time_scmapcluster <- vector("list", n_folds)
  Testing_Time_scmapcluster <- vector("list", n_folds)
  Training_Time_scmapcell <- vector("list", n_folds)
  Testing_Time_scmapcell <- vector("list", n_folds)
  # genes in rows, cells in columns from here on
  Data <- t(as.matrix(Data))

  # Build a SingleCellExperiment with log2(x + 1) counts and gene names as
  # feature symbols for the selected genes and cells.
  build_sce <- function(gene_rows, cells) {
    sce_obj <- SingleCellExperiment(list(normcounts = Data[gene_rows, cells, drop = FALSE]),
                                    colData = data.frame(cell_type1 = Labels[cells]))
    logcounts(sce_obj) <- log2(normcounts(sce_obj) + 1)
    rowData(sce_obj)$feature_symbol <- rownames(sce_obj)
    sce_obj
  }

  for (i in seq_len(n_folds)) {
    test_labels <- Labels[Test_Idx[[i]]]

    # Rows (genes) used in this fold: the top NumGenes entries of column i of
    # the gene order file, shifted by one, or every gene.
    gene_rows <- if (use_feature_selection) {
      as.vector(GenesOrder[seq_len(NumGenes), i]) + 1
    } else {
      seq_len(nrow(Data))
    }

    sce <- build_sce(gene_rows, Train_Idx[[i]])
    # With feature selection every supplied gene is requested as a feature;
    # otherwise the selectFeatures default number of features is used.
    sce <- if (use_feature_selection) {
      selectFeatures(sce, n_features = NumGenes, suppress_plot = TRUE)
    } else {
      selectFeatures(sce, suppress_plot = TRUE)
    }

    sce_test <- build_sce(gene_rows, Test_Idx[[i]])
    # Copy the training row metadata onto the test object (presumably so both
    # carry the feature flags recorded by selectFeatures -- confirm against scmap).
    sce_test@rowRanges@elementMetadata@listData <- sce@rowRanges@elementMetadata@listData

    # scmap-cluster
    start_time <- Sys.time()
    sce <- indexCluster(sce)
    end_time <- Sys.time()
    Training_Time_scmapcluster[[i]] <- as.numeric(difftime(end_time, start_time, units = 'secs'))

    start_time <- Sys.time()
    scmapCluster_results <- scmapCluster(projection = sce_test,
                                         index_list = list(metadata(sce)$scmap_cluster_index))
    end_time <- Sys.time()
    Testing_Time_scmapcluster[[i]] <- as.numeric(difftime(end_time, start_time, units = 'secs'))

    True_Labels_scmapcluster[[i]] <- test_labels
    Pred_Labels_scmapcluster[[i]] <- scmapCluster_results$combined_labs

    # scmap-cell
    start_time <- Sys.time()
    set.seed(1)
    sce <- indexCell(sce)
    end_time <- Sys.time()
    Training_Time_scmapcell[[i]] <- as.numeric(difftime(end_time, start_time, units = 'secs'))

    start_time <- Sys.time()
    scmapCell_results <- scmapCell(sce_test, list(metadata(sce)$scmap_cell_index))
    scmapCell_clusters <- scmapCell2Cluster(scmapCell_results,
                                            list(as.character(colData(sce)$cell_type1)))
    end_time <- Sys.time()
    Testing_Time_scmapcell[[i]] <- as.numeric(difftime(end_time, start_time, units = 'secs'))

    True_Labels_scmapcell[[i]] <- test_labels
    Pred_Labels_scmapcell[[i]] <- scmapCell_clusters$combined_labs
  }

  # Concatenate the per-fold results in fold order
  True_Labels_scmapcluster <- as.vector(unlist(True_Labels_scmapcluster))
  Pred_Labels_scmapcluster <- as.vector(unlist(Pred_Labels_scmapcluster))
  True_Labels_scmapcell <- as.vector(unlist(True_Labels_scmapcell))
  Pred_Labels_scmapcell <- as.vector(unlist(Pred_Labels_scmapcell))
  Training_Time_scmapcluster <- as.vector(unlist(Training_Time_scmapcluster))
  Testing_Time_scmapcluster <- as.vector(unlist(Testing_Time_scmapcluster))
  Training_Time_scmapcell <- as.vector(unlist(Training_Time_scmapcell))
  Testing_Time_scmapcell <- as.vector(unlist(Testing_Time_scmapcell))

  # File names carry the gene count only when feature selection was used,
  # e.g. scmapcell_100_True_Labels.csv versus scmapcell_True_Labels.csv.
  gene_tag <- if (use_feature_selection) paste0(NumGenes, '_') else ''
  write_result <- function(values, method, suffix) {
    write.csv(values,
              file.path(OutputDir, paste0(method, '_', gene_tag, suffix)),
              row.names = FALSE)
  }

  write_result(True_Labels_scmapcluster, 'scmapcluster', 'True_Labels.csv')
  write_result(Pred_Labels_scmapcluster, 'scmapcluster', 'Pred_Labels.csv')
  write_result(True_Labels_scmapcell, 'scmapcell', 'True_Labels.csv')
  write_result(Pred_Labels_scmapcell, 'scmapcell', 'Pred_Labels.csv')
  write_result(Training_Time_scmapcluster, 'scmapcluster', 'Training_Time.csv')
  write_result(Testing_Time_scmapcluster, 'scmapcluster', 'Testing_Time.csv')
  write_result(Training_Time_scmapcell, 'scmapcell', 'Training_Time.csv')
  write_result(Testing_Time_scmapcell, 'scmapcell', 'Testing_Time.csv')
}


================================================
FILE: Snakemake/Scripts/run_scmapcell.R
================================================
# Positional command-line arguments that follow the script name
# (interpreter options are stripped).
args <- commandArgs(trailingOnly = TRUE)

run_scmapcell <- function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){
  "
  run scmapcell
  Wrapper script to run scmap-cell on a benchmark dataset with cross validation
  (the folds are read from CV_RDataPath), outputs lists of true and predicted
  cell labels as csv files, as well as computation time.

  Parameters
  ----------
  DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
  as row names and gene names as column names.
  LabelsPath : Cell population annotations file path (.csv).
  CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
  OutputDir : Output directory defining the path of the exported file.
  GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
  defining the genes order for each cross validation fold, default is NULL.
  NumGenes : Number of genes used in case of feature selection (integer), default is NULL.
  Feature selection is applied only when both GeneOrderPath and NumGenes are given.

  Output
  ------
  Writes scmapcell_true.csv, scmapcell_pred.csv, scmapcell_training_time.csv and
  scmapcell_test_time.csv into OutputDir.
  "

  Data <- read.csv(DataPath, row.names = 1)
  Labels <- as.matrix(read.csv(LabelsPath))
  # load() restores the fold definition into this function's environment; the
  # code below relies on it providing col_Index, Cells_to_Keep, n_folds,
  # Train_Idx and Test_Idx.
  load(CV_RDataPath)
  Labels <- as.vector(Labels[, col_Index])
  Data <- Data[Cells_to_Keep, ]
  Labels <- Labels[Cells_to_Keep]

  # Scalar test: `&&` short-circuits and is the operator meant for `if`.
  use_gene_order <- !is.null(GeneOrderPath) && !is.null(NumGenes)
  if (use_gene_order) {
    GenesOrder <- read.csv(GeneOrderPath)
  }

  #############################################################################
  #                                 scmap                                     #
  #############################################################################
  library(scmap)
  library(SingleCellExperiment)
  True_Labels_scmapcell <- vector("list", n_folds)
  Pred_Labels_scmapcell <- vector("list", n_folds)
  Training_Time_scmapcell <- vector("list", n_folds)
  Testing_Time_scmapcell <- vector("list", n_folds)
  Data <- t(as.matrix(Data))   # genes in rows, cells in columns

  # Build a SingleCellExperiment for the given cells, optionally restricted to
  # the given gene rows. drop = FALSE keeps a matrix even for one gene/cell.
  make_sce <- function(cell_idx, gene_rows = NULL) {
    if (is.null(gene_rows)) {
      counts <- Data[, cell_idx, drop = FALSE]
    } else {
      counts <- Data[gene_rows, cell_idx, drop = FALSE]
    }
    out <- SingleCellExperiment(list(normcounts = counts),
                                colData = data.frame(cell_type1 = Labels[cell_idx]))
    logcounts(out) <- log2(normcounts(out) + 1)
    # use gene names as feature symbols
    rowData(out)$feature_symbol <- rownames(out)
    out
  }

  # seq_len() yields an empty sequence (instead of 1, 0) when n_folds is 0.
  for (i in seq_len(n_folds)) {
    if (use_gene_order) {
      # Column i holds the gene ranking of fold i; the +1 suggests the stored
      # indices are 0-based (TODO confirm against the ranking script).
      gene_rows <- as.vector(GenesOrder[seq_len(NumGenes), i]) + 1
      sce <- make_sce(Train_Idx[[i]], gene_rows)
      sce <- selectFeatures(sce, n_features = NumGenes, suppress_plot = TRUE)
    } else {
      gene_rows <- NULL
      sce <- make_sce(Train_Idx[[i]])
      sce <- selectFeatures(sce, suppress_plot = TRUE)
    }
    sce_test <- make_sce(Test_Idx[[i]], gene_rows)
    # Copy the training row metadata onto the test set so both objects carry
    # the feature selection computed on the training fold.
    sce_test@rowRanges@elementMetadata@listData <- sce@rowRanges@elementMetadata@listData

    # scmap-cell
    start_time <- Sys.time()
    set.seed(1)
    sce <- indexCell(sce)
    end_time <- Sys.time()
    Training_Time_scmapcell[[i]] <- as.numeric(difftime(end_time, start_time, units = 'secs'))

    start_time <- Sys.time()
    scmapCell_results <- scmapCell(sce_test, list(metadata(sce)$scmap_cell_index))
    scmapCell_clusters <- scmapCell2Cluster(scmapCell_results, list(as.character(colData(sce)$cell_type1)))
    end_time <- Sys.time()
    Testing_Time_scmapcell[[i]] <- as.numeric(difftime(end_time, start_time, units = 'secs'))

    True_Labels_scmapcell[[i]] <- Labels[Test_Idx[[i]]]
    Pred_Labels_scmapcell[[i]] <- scmapCell_clusters$combined_labs
  }

  # Flatten the per-fold results into plain vectors, in fold order.
  True_Labels_scmapcell <- as.vector(unlist(True_Labels_scmapcell))
  Pred_Labels_scmapcell <- as.vector(unlist(Pred_Labels_scmapcell))
  Training_Time_scmapcell <- as.vector(unlist(Training_Time_scmapcell))
  Testing_Time_scmapcell <- as.vector(unlist(Testing_Time_scmapcell))

  write.csv(True_Labels_scmapcell, file.path(OutputDir, 'scmapcell_true.csv'), row.names = FALSE)
  write.csv(Pred_Labels_scmapcell, file.path(OutputDir, 'scmapcell_pred.csv'), row.names = FALSE)
  write.csv(Training_Time_scmapcell, file.path(OutputDir, 'scmapcell_training_time.csv'), row.names = FALSE)
  write.csv(Testing_Time_scmapcell, file.path(OutputDir, 'scmapcell_test_time.csv'), row.names = FALSE)
}
# Command-line entry point. Expected positional arguments:
#   1 DataPath, 2 LabelsPath, 3 CV_RDataPath, 4 OutputDir,
#   5 GeneOrderPath, 6 NumGenes ("0" disables feature selection).
# A missing sixth argument is treated like "0"; previously it produced an NA
# comparison and `if` failed with an unhelpful error.
if (length(args) < 4) {
  stop("run_scmapcell.R needs at least 4 arguments: DataPath LabelsPath CV_RDataPath OutputDir",
       call. = FALSE)
}
if (length(args) < 6 || is.na(args[6]) || args[6] == "0") {
  run_scmapcell(args[1], args[2], args[3], args[4])
} else {
  run_scmapcell(args[1], args[2], args[3], args[4], args[5], as.numeric(args[6]))
}


================================================
FILE: Snakemake/Scripts/run_scmapcluster.R
================================================
# Positional command-line arguments that follow the script name
# (interpreter options are stripped).
args <- commandArgs(trailingOnly = TRUE)

run_scmapcluster <- function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){
  "
  run scmapcluster
  Wrapper script to run scmap-cluster on a benchmark dataset with cross validation
  (the folds are read from CV_RDataPath), outputs lists of true and predicted
  cell labels as csv files, as well as computation time.

  Parameters
  ----------
  DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
  as row names and gene names as column names.
  LabelsPath : Cell population annotations file path (.csv).
  CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
  OutputDir : Output directory defining the path of the exported file.
  GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
  defining the genes order for each cross validation fold, default is NULL.
  NumGenes : Number of genes used in case of feature selection (integer), default is NULL.
  Feature selection is applied only when both GeneOrderPath and NumGenes are given.

  Output
  ------
  Writes scmapcluster_true.csv, scmapcluster_pred.csv,
  scmapcluster_training_time.csv and scmapcluster_test_time.csv into OutputDir.
  "

  Data <- read.csv(DataPath, row.names = 1)
  Labels <- as.matrix(read.csv(LabelsPath))
  # load() restores the fold definition into this function's environment; the
  # code below relies on it providing col_Index, Cells_to_Keep, n_folds,
  # Train_Idx and Test_Idx.
  load(CV_RDataPath)
  Labels <- as.vector(Labels[, col_Index])
  Data <- Data[Cells_to_Keep, ]
  Labels <- Labels[Cells_to_Keep]

  # Scalar test: `&&` short-circuits and is the operator meant for `if`.
  use_gene_order <- !is.null(GeneOrderPath) && !is.null(NumGenes)
  if (use_gene_order) {
    GenesOrder <- read.csv(GeneOrderPath)
  }

  #############################################################################
  #                                 scmap                                     #
  #############################################################################
  library(scmap)
  library(SingleCellExperiment)
  True_Labels_scmapcluster <- vector("list", n_folds)
  Pred_Labels_scmapcluster <- vector("list", n_folds)
  Training_Time_scmapcluster <- vector("list", n_folds)
  Testing_Time_scmapcluster <- vector("list", n_folds)
  Data <- t(as.matrix(Data))   # genes in rows, cells in columns

  # Build a SingleCellExperiment for the given cells, optionally restricted to
  # the given gene rows. drop = FALSE keeps a matrix even for one gene/cell.
  make_sce <- function(cell_idx, gene_rows = NULL) {
    if (is.null(gene_rows)) {
      counts <- Data[, cell_idx, drop = FALSE]
    } else {
      counts <- Data[gene_rows, cell_idx, drop = FALSE]
    }
    out <- SingleCellExperiment(list(normcounts = counts),
                                colData = data.frame(cell_type1 = Labels[cell_idx]))
    logcounts(out) <- log2(normcounts(out) + 1)
    # use gene names as feature symbols
    rowData(out)$feature_symbol <- rownames(out)
    out
  }

  # seq_len() yields an empty sequence (instead of 1, 0) when n_folds is 0.
  for (i in seq_len(n_folds)) {
    if (use_gene_order) {
      # Column i holds the gene ranking of fold i; the +1 suggests the stored
      # indices are 0-based (TODO confirm against the ranking script).
      gene_rows <- as.vector(GenesOrder[seq_len(NumGenes), i]) + 1
      sce <- make_sce(Train_Idx[[i]], gene_rows)
      sce <- selectFeatures(sce, n_features = NumGenes, suppress_plot = TRUE)
    } else {
      gene_rows <- NULL
      sce <- make_sce(Train_Idx[[i]])
      sce <- selectFeatures(sce, suppress_plot = TRUE)
    }
    sce_test <- make_sce(Test_Idx[[i]], gene_rows)
    # Copy the training row metadata onto the test set so both objects carry
    # the feature selection computed on the training fold.
    sce_test@rowRanges@elementMetadata@listData <- sce@rowRanges@elementMetadata@listData

    # scmap-cluster
    start_time <- Sys.time()
    sce <- indexCluster(sce)
    end_time <- Sys.time()
    Training_Time_scmapcluster[[i]] <- as.numeric(difftime(end_time, start_time, units = 'secs'))

    start_time <- Sys.time()
    scmapCluster_results <- scmapCluster(projection = sce_test, index_list = list(metadata(sce)$scmap_cluster_index))
    end_time <- Sys.time()
    Testing_Time_scmapcluster[[i]] <- as.numeric(difftime(end_time, start_time, units = 'secs'))

    True_Labels_scmapcluster[[i]] <- Labels[Test_Idx[[i]]]
    Pred_Labels_scmapcluster[[i]] <- scmapCluster_results$combined_labs
  }

  # Flatten the per-fold results into plain vectors, in fold order.
  True_Labels_scmapcluster <- as.vector(unlist(True_Labels_scmapcluster))
  Pred_Labels_scmapcluster <- as.vector(unlist(Pred_Labels_scmapcluster))
  Training_Time_scmapcluster <- as.vector(unlist(Training_Time_scmapcluster))
  Testing_Time_scmapcluster <- as.vector(unlist(Testing_Time_scmapcluster))

  write.csv(True_Labels_scmapcluster, file.path(OutputDir, 'scmapcluster_true.csv'), row.names = FALSE)
  write.csv(Pred_Labels_scmapcluster, file.path(OutputDir, 'scmapcluster_pred.csv'), row.names = FALSE)
  write.csv(Training_Time_scmapcluster, file.path(OutputDir, 'scmapcluster_training_time.csv'), row.names = FALSE)
  write.csv(Testing_Time_scmapcluster, file.path(OutputDir, 'scmapcluster_test_time.csv'), row.names = FALSE)
}
# Command-line entry point. Expected positional arguments:
#   1 DataPath, 2 LabelsPath, 3 CV_RDataPath, 4 OutputDir,
#   5 GeneOrderPath, 6 NumGenes ("0" disables feature selection).
# A missing sixth argument is treated like "0"; previously it produced an NA
# comparison and `if` failed with an unhelpful error.
if (length(args) < 4) {
  stop("run_scmapcluster.R needs at least 4 arguments: DataPath LabelsPath CV_RDataPath OutputDir",
       call. = FALSE)
}
if (length(args) < 6 || is.na(args[6]) || args[6] == "0") {
  run_scmapcluster(args[1], args[2], args[3], args[4])
} else {
  run_scmapcluster(args[1], args[2], args[3], args[4], args[5], as.numeric(args[6]))
}


================================================
FILE: Snakemake/Scripts/run_scmaptotal.R
================================================
run_scmap <- function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){
  "
  run scmap
  Wrapper script to run scmap (both scmap-cluster and scmap-cell) on a benchmark
  dataset with cross validation (the folds are read from CV_RDataPath), outputs
  lists of true and predicted cell labels as csv files, as well as computation time.

  Parameters
  ----------
  DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
  as row names and gene names as column names.
  LabelsPath : Cell population annotations file path (.csv).
  CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
  OutputDir : Output directory defining the path of the exported file.
  GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
  defining the genes order for each cross validation fold, default is NULL.
  NumGenes : Number of genes used in case of feature selection (integer), default is NULL.
  Feature selection is applied only when both GeneOrderPath and NumGenes are given.

  Output
  ------
  For each of scmapcluster and scmapcell, writes <method>_True_Labels.csv,
  <method>_Pred_Labels.csv, <method>_Training_Time.csv and
  <method>_Testing_Time.csv into OutputDir. With feature selection the file
  names become <method>_<NumGenes>_... instead.
  "

  Data <- read.csv(DataPath, row.names = 1)
  Labels <- as.matrix(read.csv(LabelsPath))
  # load() restores the fold definition into this function's environment; the
  # code below relies on it providing col_Index, Cells_to_Keep, n_folds,
  # Train_Idx and Test_Idx.
  load(CV_RDataPath)
  Labels <- as.vector(Labels[, col_Index])
  Data <- Data[Cells_to_Keep, ]
  Labels <- Labels[Cells_to_Keep]

  # Scalar test: `&&` short-circuits and is the operator meant for `if`.
  use_gene_order <- !is.null(GeneOrderPath) && !is.null(NumGenes)
  if (use_gene_order) {
    GenesOrder <- read.csv(GeneOrderPath)
  }

  #############################################################################
  #                                 scmap                                     #
  #############################################################################
  library(scmap)
  library(SingleCellExperiment)
  True_Labels_scmapcluster <- vector("list", n_folds)
  Pred_Labels_scmapcluster <- vector("list", n_folds)
  True_Labels_scmapcell <- vector("list", n_folds)
  Pred_Labels_scmapcell <- vector("list", n_folds)
  Training_Time_scmapcluster <- vector("list", n_folds)
  Testing_Time_scmapcluster <- vector("list", n_folds)
  Training_Time_scmapcell <- vector("list", n_folds)
  Testing_Time_scmapcell <- vector("list", n_folds)
  Data <- t(as.matrix(Data))   # genes in rows, cells in columns

  # Build a SingleCellExperiment for the given cells, optionally restricted to
  # the given gene rows. drop = FALSE keeps a matrix even for one gene/cell.
  make_sce <- function(cell_idx, gene_rows = NULL) {
    if (is.null(gene_rows)) {
      counts <- Data[, cell_idx, drop = FALSE]
    } else {
      counts <- Data[gene_rows, cell_idx, drop = FALSE]
    }
    out <- SingleCellExperiment(list(normcounts = counts),
                                colData = data.frame(cell_type1 = Labels[cell_idx]))
    logcounts(out) <- log2(normcounts(out) + 1)
    # use gene names as feature symbols
    rowData(out)$feature_symbol <- rownames(out)
    out
  }

  # seq_len() yields an empty sequence (instead of 1, 0) when n_folds is 0.
  for (i in seq_len(n_folds)) {
    if (use_gene_order) {
      # Column i holds the gene ranking of fold i; the +1 suggests the stored
      # indices are 0-based (TODO confirm against the ranking script).
      gene_rows <- as.vector(GenesOrder[seq_len(NumGenes), i]) + 1
      sce <- make_sce(Train_Idx[[i]], gene_rows)
      sce <- selectFeatures(sce, n_features = NumGenes, suppress_plot = TRUE)
    } else {
      gene_rows <- NULL
      sce <- make_sce(Train_Idx[[i]])
      sce <- selectFeatures(sce, suppress_plot = TRUE)
    }
    sce_test <- make_sce(Test_Idx[[i]], gene_rows)
    # Copy the training row metadata onto the test set so both objects carry
    # the feature selection computed on the training fold.
    sce_test@rowRanges@elementMetadata@listData <- sce@rowRanges@elementMetadata@listData

    # scmap-cluster
    start_time <- Sys.time()
    sce <- indexCluster(sce)
    end_time <- Sys.time()
    Training_Time_scmapcluster[[i]] <- as.numeric(difftime(end_time, start_time, units = 'secs'))

    start_time <- Sys.time()
    scmapCluster_results <- scmapCluster(projection = sce_test, index_list = list(metadata(sce)$scmap_cluster_index))
    end_time <- Sys.time()
    Testing_Time_scmapcluster[[i]] <- as.numeric(difftime(end_time, start_time, units = 'secs'))

    True_Labels_scmapcluster[[i]] <- Labels[Test_Idx[[i]]]
    Pred_Labels_scmapcluster[[i]] <- scmapCluster_results$combined_labs

    # scmap-cell
    start_time <- Sys.time()
    set.seed(1)
    sce <- indexCell(sce)
    end_time <- Sys.time()
    Training_Time_scmapcell[[i]] <- as.numeric(difftime(end_time, start_time, units = 'secs'))

    start_time <- Sys.time()
    scmapCell_results <- scmapCell(sce_test, list(metadata(sce)$scmap_cell_index))
    scmapCell_clusters <- scmapCell2Cluster(scmapCell_results, list(as.character(colData(sce)$cell_type1)))
    end_time <- Sys.time()
    Testing_Time_scmapcell[[i]] <- as.numeric(difftime(end_time, start_time, units = 'secs'))

    True_Labels_scmapcell[[i]] <- Labels[Test_Idx[[i]]]
    Pred_Labels_scmapcell[[i]] <- scmapCell_clusters$combined_labs
  }

  # Flatten the per-fold results into plain vectors, in fold order.
  True_Labels_scmapcluster <- as.vector(unlist(True_Labels_scmapcluster))
  Pred_Labels_scmapcluster <- as.vector(unlist(Pred_Labels_scmapcluster))
  True_Labels_scmapcell <- as.vector(unlist(True_Labels_scmapcell))
  Pred_Labels_scmapcell <- as.vector(unlist(Pred_Labels_scmapcell))
  Training_Time_scmapcluster <- as.vector(unlist(Training_Time_scmapcluster))
  Testing_Time_scmapcluster <- as.vector(unlist(Testing_Time_scmapcluster))
  Training_Time_scmapcell <- as.vector(unlist(Training_Time_scmapcell))
  Testing_Time_scmapcell <- as.vector(unlist(Testing_Time_scmapcell))

  # Write into OutputDir through explicit paths rather than setwd(), so the
  # caller's working directory is left untouched. File names are unchanged.
  name_infix <- if (use_gene_order) paste0('_', NumGenes) else ''
  write_result <- function(values, method, what) {
    out_file <- file.path(OutputDir, paste0(method, name_infix, '_', what, '.csv'))
    write.csv(values, out_file, row.names = FALSE)
  }

  write_result(True_Labels_scmapcluster, 'scmapcluster', 'True_Labels')
  write_result(Pred_Labels_scmapcluster, 'scmapcluster', 'Pred_Labels')
  write_result(True_Labels_scmapcell, 'scmapcell', 'True_Labels')
  write_result(Pred_Labels_scmapcell, 'scmapcell', 'Pred_Labels')
  write_result(Training_Time_scmapcluster, 'scmapcluster', 'Training_Time')
  write_result(Testing_Time_scmapcluster, 'scmapcluster', 'Testing_Time')
  write_result(Training_Time_scmapcell, 'scmapcell', 'Training_Time')
  write_result(Testing_Time_scmapcell, 'scmapcell', 'Testing_Time')
}


================================================
FILE: Snakemake/Scripts/run_singleCellNet.R
================================================
# Positional command-line arguments that follow the script name
# (interpreter options are stripped).
args <- commandArgs(trailingOnly = TRUE)

run_singleCellNet<-function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){
  "
  run singleCellNet
  Wrapper script to run singleCellNet on a benchmark dataset with cross validation
  (the folds are read from CV_RDataPath), outputs lists of true and predicted
  cell labels as csv files, as well as computation time.

  Parameters
  ----------
  DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
  as row names and gene names as column names.
  LabelsPath : Cell population annotations file path (.csv).
  CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
  OutputDir : Output directory defining the path of the exported file.
  GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
  defining the genes order for each cross validation fold, default is NULL.
  NumGenes : Number of genes used in case of feature selection (integer), default is NULL.
  Feature selection is applied only when both GeneOrderPath and NumGenes are given.

  Output
  ------
  Writes singleCellNet_true.csv, singleCellNet_pred.csv,
  singleCellNet_training_time.csv and singleCellNet_test_time.csv into OutputDir.
  "

  Data <- read.csv(DataPath, row.names = 1)
  # Gene names (columns) get every underscore replaced by a dot.
  colnames(Data) <- gsub('_', '.', colnames(Data), fixed = TRUE)
  Labels <- as.matrix(read.csv(LabelsPath))
  # load() restores the fold definition into this function's environment; the
  # code below relies on it providing col_Index, Cells_to_Keep, n_folds,
  # Train_Idx and Test_Idx.
  load(CV_RDataPath)
  Labels <- as.vector(Labels[, col_Index])
  Data <- Data[Cells_to_Keep, ]
  Labels <- Labels[Cells_to_Keep]

  # Scalar test: `&&` short-circuits and is the operator meant for `if`.
  use_gene_order <- !is.null(GeneOrderPath) && !is.null(NumGenes)
  if (use_gene_order) {
    GenesOrder <- read.csv(GeneOrderPath)
  }

  #############################################################################
  #                              singleCellNet                                #
  #############################################################################
  library(singleCellNet)
  library(dplyr)
  True_Labels_singleCellNet <- vector("list", n_folds)
  Pred_Labels_singleCellNet <- vector("list", n_folds)
  Training_Time_singleCellNet <- vector("list", n_folds)
  Testing_Time_singleCellNet <- vector("list", n_folds)
  Data <- t(as.matrix(Data))              # genes in rows, cells in columns

  # seq_len() yields an empty sequence (instead of 1, 0) when n_folds is 0.
  for (i in seq_len(n_folds)) {
    # drop = FALSE keeps a matrix even when a fold or gene set has one element.
    if (use_gene_order) {
      # Column i holds the gene ranking of fold i; the +1 suggests the stored
      # indices are 0-based (TODO confirm against the ranking script).
      gene_rows <- as.vector(GenesOrder[seq_len(NumGenes), i]) + 1
      DataTrain <- Data[gene_rows, Train_Idx[[i]], drop = FALSE]
      DataTest <- Data[gene_rows, Test_Idx[[i]], drop = FALSE]
    } else {
      DataTrain <- Data[, Train_Idx[[i]], drop = FALSE]
      DataTest <- Data[, Test_Idx[[i]], drop = FALSE]
    }

    # Training: pick classification genes, derive gene pairs, fit the classifier.
    start_time <- Sys.time()
    cgenes2 <- findClassyGenes(DataTrain, data.frame(Annotation = Labels[Train_Idx[[i]]]), "Annotation")
    cgenesA <- cgenes2[['cgenes']]
    grps <- cgenes2[['grps']]
    DataTrain <- as.matrix(DataTrain[cgenesA, ])
    xpairs <- ptGetTop(DataTrain, grps, ncores = 1)
    pdTrain <- query_transform(DataTrain[cgenesA, ], xpairs)
    rf <- sc_makeClassifier(pdTrain[xpairs, ], genes = xpairs, groups = grps)
    end_time <- Sys.time()
    Training_Time_singleCellNet[[i]] <- as.numeric(difftime(end_time, start_time, units = 'secs'))

    # Testing: apply the same gene-pair transform to the held-out cells.
    start_time <- Sys.time()
    DataTest <- query_transform(DataTest[cgenesA, ], xpairs)
    classRes <- rf_classPredict(rf, DataTest)
    end_time <- Sys.time()
    Testing_Time_singleCellNet[[i]] <- as.numeric(difftime(end_time, start_time, units = 'secs'))

    True_Labels_singleCellNet[[i]] <- Labels[Test_Idx[[i]]]
    # Best-scoring class (row) per column; only the first length(Test_Idx[[i]])
    # columns are kept. NOTE(review): presumably rf_classPredict appends extra
    # columns beyond the query cells - confirm against the singleCellNet docs.
    best_class <- rownames(classRes)[apply(classRes, 2, which.max)]
    Pred_Labels_singleCellNet[[i]] <- best_class[seq_along(Test_Idx[[i]])]
  }

  # Flatten the per-fold results into plain vectors, in fold order.
  True_Labels_singleCellNet <- as.vector(unlist(True_Labels_singleCellNet))
  Pred_Labels_singleCellNet <- as.vector(unlist(Pred_Labels_singleCellNet))
  Training_Time_singleCellNet <- as.vector(unlist(Training_Time_singleCellNet))
  Testing_Time_singleCellNet <- as.vector(unlist(Testing_Time_singleCellNet))

  write.csv(True_Labels_singleCellNet, file.path(OutputDir, 'singleCellNet_true.csv'), row.names = FALSE)
  write.csv(Pred_Labels_singleCellNet, file.path(OutputDir, 'singleCellNet_pred.csv'), row.names = FALSE)
  write.csv(Training_Time_singleCellNet, file.path(OutputDir, 'singleCellNet_training_time.csv'), row.names = FALSE)
  write.csv(Testing_Time_singleCellNet, file.path(OutputDir, 'singleCellNet_test_time.csv'), row.names = FALSE)
}

# Command-line entry point. Expected positional arguments:
#   1 DataPath, 2 LabelsPath, 3 CV_RDataPath, 4 OutputDir,
#   5 GeneOrderPath, 6 NumGenes ("0" disables feature selection).
# A missing sixth argument is treated like "0"; previously it produced an NA
# comparison and `if` failed with an unhelpful error.
if (length(args) < 4) {
  stop("run_singleCellNet.R needs at least 4 arguments: DataPath LabelsPath CV_RDataPath OutputDir",
       call. = FALSE)
}
if (length(args) < 6 || is.na(args[6]) || args[6] == "0") {
  run_singleCellNet(args[1], args[2], args[3], args[4])
} else {
  run_singleCellNet(args[1], args[2], args[3], args[4], args[5], as.numeric(args[6]))
}


================================================
FILE: Snakemake/Snakefile
================================================
# Image tag appended to every rule's docker:// container URI.
# FIXME: use tagged versions instead of the moving "latest" tag.
dockerTag = 'latest'

def feature_ranking(w):
    """Input function returning the gene-ranking file for a rule.

    A ``feature_ranking`` entry in the workflow config takes precedence.
    Without one, the dropout-based ranking file inside the job's output
    directory (``w.output_dir``) is returned.
    """
    if "feature_ranking" not in config:
        template = "{output_dir}/rank_genes_dropouts.csv"
        return template.format(output_dir=w.output_dir)
    return config["feature_ranking"]

"""
One rule to... rule... them all...
"""
# Target rule: requests one evaluation CSV per configured tool and per measure
# inside the configured output directory.
rule all:
  input:
    tool_outputs = expand(
        "{output_dir}/evaluation/{measure}/{tool}.csv",
        tool=config["tools_to_run"],
        output_dir=config["output_dir"],
        measure=[
            "Confusion",
            "F1",
            "PopSize",
            "Summary",
        ],
    )


"""
Rule for the result evaluation
"""
# Compares a tool's predicted labels with the true labels by calling
# evaluate.R, which is expected to produce the four evaluation CSVs below.
rule evaluate:
  input:
    true="{output_dir}/{tool}/{tool}_true.csv",
    pred="{output_dir}/{tool}/{tool}_pred.csv"
  output:
    "{output_dir}/evaluation/Confusion/{tool}.csv",
    "{output_dir}/evaluation/F1/{tool}.csv",
    "{output_dir}/evaluation/PopSize/{tool}.csv",
    "{output_dir}/evaluation/Summary/{tool}.csv",
  singularity: "docker://scrnaseqbenchmark/baseline:" + dockerTag
  log: "{output_dir}/evaluation/{tool}.log"
  shell:
    "Rscript evaluate.R {input.true} {input.pred} "
    "{wildcards.output_dir}/evaluation {wildcards.tool} "
    "&> {log}"


"""
Rule for creating cross validation folds
"""
# Builds the cross-validation fold definition (CV_folds.RData) from the label
# file with Cross_Validation.R.
rule generate_CV_folds:
  input: config["labfile"],
  output: "{output_dir}/CV_folds.RData"
  params:
    # label column handed to the script; defaults to 1
    column = config.get("column", 1)
  singularity: "docker://scrnaseqbenchmark/cross_validation:" + dockerTag
  log: "{output_dir}/CV_folds.log"
  shell:
    "Rscript Cross_Validation.R {input} {params.column} "
    "{wildcards.output_dir} "
    "&> {log}"


"""
Rule for creating feature rank lists
"""
# Produces the default gene ranking (rank_genes_dropouts.csv) from the data
# file and the cross-validation folds with rank_gene_dropouts.py.
# The former leading `echo test > {output_dir}/test` command was a debugging
# leftover that dropped an undeclared stray file into the output directory on
# every run; it has been removed.
rule generate_dropouts_feature_rankings:
    input:
        datafile = config["datafile"],
        folds = "{output_dir}/CV_folds.RData"
    output: "{output_dir}/rank_genes_dropouts.csv"
    log: "{output_dir}/rank_genes_dropouts.log"
    singularity: "docker://scrnaseqbenchmark/baseline:{}".format(dockerTag)
    shell:
        "python3 rank_gene_dropouts.py "
        "{input.datafile} "
        "{input.folds} "
        "{wildcards.output_dir} "
        "&> {log}"


"""
Rules for R based tools.
"""
# Runs Scripts/run_singleCellNet.R on the cross-validation folds.
# n_features is the script's last argument; the default 0 makes the script
# skip feature selection.
rule singleCellNet:
  input:
    datafile = config["datafile"],
    labfile = config["labfile"],
    folds = "{output_dir}/CV_folds.RData",
    ranking = feature_ranking
  output:
    pred = "{output_dir}/singleCellNet/singleCellNet_pred.csv",
    true = "{output_dir}/singleCellNet/singleCellNet_true.csv",
    test_time = "{output_dir}/singleCellNet/singleCellNet_test_time.csv",
    training_time = "{output_dir}/singleCellNet/singleCellNet_training_time.csv"
  params:
    n_features = config.get("number_of_features", 0)
  singularity: "docker://scrnaseqbenchmark/singlecellnet:" + dockerTag
  log: "{output_dir}/singleCellNet/singleCellNet.log"
  shell:
    "Rscript Scripts/run_singleCellNet.R "
    "{input.datafile} {input.labfile} {input.folds} "
    "{wildcards.output_dir}/singleCellNet "
    "{input.ranking} {params.n_features} "
    "&> {log}"

# Runs Scripts/run_scmapcell.R on the cross-validation folds.
# n_features is the script's last argument; the default 0 makes the script
# skip feature selection.
rule scmapcell:
  input:
    datafile = config["datafile"],
    labfile = config["labfile"],
    folds = "{output_dir}/CV_folds.RData",
    ranking = feature_ranking
  output:
    pred = "{output_dir}/scmapcell/scmapcell_pred.csv",
    true = "{output_dir}/scmapcell/scmapcell_true.csv",
    test_time = "{output_dir}/scmapcell/scmapcell_test_time.csv",
    training_time = "{output_dir}/scmapcell/scmapcell_training_time.csv"
  params:
    n_features = config.get("number_of_features", 0)
  singularity: "docker://scrnaseqbenchmark/scmap:" + dockerTag
  log: "{output_dir}/scmapcell/scmapcell.log"
  shell:
    "Rscript Scripts/run_scmapcell.R "
    "{input.datafile} {input.labfile} {input.folds} "
    "{wildcards.output_dir}/scmapcell "
    "{input.ranking} {params.n_features} "
    "&> {log}"

# Runs Scripts/run_scmapcluster.R on the cross-validation folds.
# n_features is the script's last argument; the default 0 makes the script
# skip feature selection.
rule scmapcluster:
  input:
    datafile = config["datafile"],
    labfile = config["labfile"],
    folds = "{output_dir}/CV_folds.RData",
    ranking = feature_ranking
  output:
    pred = "{output_dir}/scmapcluster/scmapcluster_pred.csv",
    true = "{output_dir}/scmapcluster/scmapcluster_true.csv",
    test_time = "{output_dir}/scmapcluster/scmapcluster_test_time.csv",
    training_time = "{output_dir}/scmapcluster/scmapcluster_training_time.csv"
  params:
    n_features = config.get("number_of_features", 0)
  singularity: "docker://scrnaseqbenchmark/scmap:" + dockerTag
  log: "{output_dir}/scmapcluster/scmapcluster.log"
  shell:
    "Rscript Scripts/run_scmapcluster.R "
    "{input.datafile} {input.labfile} {input.folds} "
    "{wildcards.output_dir}/scmapcluster "
    "{input.ranking} {params.n_features} "
    "&> {log}"

# Runs Scripts/run_scID.R on the cross-validation folds. This tool reports a
# single total time instead of separate training and test times.
rule scID:
  input:
    datafile = config["datafile"],
    labfile = config["labfile"],
    folds = "{output_dir}/CV_folds.RData",
    ranking = feature_ranking
  output:
    pred = "{output_dir}/scID/scID_pred.csv",
    true = "{output_dir}/scID/scID_true.csv",
    total_time = "{output_dir}/scID/scID_total_time.csv"
  params:
    n_features = config.get("number_of_features", 0)
  singularity: "docker://scrnaseqbenchmark/scid:" + dockerTag
  log: "{output_dir}/scID/scID.log"
  shell:
    "Rscript Scripts/run_scID.R "
    "{input.datafile} {input.labfile} {input.folds} "
    "{wildcards.output_dir}/scID "
    "{input.ranking} {params.n_features} "
    "&> {log}"

# Runs Scripts/run_CHETAH.R on the cross-validation folds. This tool reports a
# single total time instead of separate training and test times.
rule CHETAH:
  input:
    datafile = config["datafile"],
    labfile = config["labfile"],
    folds = "{output_dir}/CV_folds.RData",
    ranking = feature_ranking
  output:
    pred = "{output_dir}/CHETAH/CHETAH_pred.csv",
    true = "{output_dir}/CHETAH/CHETAH_true.csv",
    total_time = "{output_dir}/CHETAH/CHETAH_total_time.csv"
  params:
    n_features = config.get("number_of_features", 0)
  singularity: "docker://scrnaseqbenchmark/chetah:" + dockerTag
  log: "{output_dir}/CHETAH/CHETAH.log"
  shell:
    "Rscript Scripts/run_CHETAH.R "
    "{input.datafile} {input.labfile} {input.folds} "
    "{wildcards.output_dir}/CHETAH "
    "{input.ranking} {params.n_features} "
    "&> {log}"

# Runs Scripts/run_SingleR.R on the cross-validation folds. This tool reports a
# single total time instead of separate training and test times.
rule SingleR:
  input:
    datafile = config["datafile"],
    labfile = config["labfile"],
    folds = "{output_dir}/CV_folds.RData",
    ranking = feature_ranking
  output:
    pred = "{output_dir}/SingleR/SingleR_pred.csv",
    true = "{output_dir}/SingleR/SingleR_true.csv",
    total_time = "{output_dir}/SingleR/SingleR_total_time.csv"
  params:
    n_features = config.get("number_of_features", 0)
  singularity: "docker://scrnaseqbenchmark/singler:" + dockerTag
  log: "{output_dir}/SingleR/SingleR.log"
  shell:
    "Rscript Scripts/run_SingleR.R "
    "{input.datafile} {input.labfile} {input.folds} "
    "{wildcards.output_dir}/SingleR "
    "{input.ranking} {params.n_features} "
    "&> {log}"

# NOTE: non-conformant to the rest of the rules. It takes a gene-names file and
# a marker file instead of a feature ranking, and passes a human flag ("T"/"F")
# instead of a feature count. Both extra files fall back to the placeholder
# "UNSPECIFIEDFILE" when they are not configured.
rule Garnett_CV:
  input:
    datafile = config["datafile"],
    labfile = config["labfile"],
    folds = "{output_dir}/CV_folds.RData",
    genes_names = config.get("genes", "UNSPECIFIEDFILE"),
    markers = config.get("Garnett_CV", {}).get("markers", "UNSPECIFIEDFILE")
  output:
    pred = "{output_dir}/Garnett_CV/Garnett_CV_pred.csv",
    true = "{output_dir}/Garnett_CV/Garnett_CV_true.csv",
    test_time = "{output_dir}/Garnett_CV/Garnett_CV_test_time.csv",
    training_time = "{output_dir}/Garnett_CV/Garnett_CV_training_time.csv"
  params:
    human = "T" if config.get("human", True) else "F"
  singularity: "docker://scrnaseqbenchmark/garnett:" + dockerTag
  log: "{output_dir}/Garnett_CV/Garnett_CV.log"
  shell:
    "Rscript Scripts/run_Garnett_CV.R "
    "{input.datafile} {input.labfile} {input.folds} "
    "{input.genes_names} {input.markers} "
    "{wildcards.output_dir}/Garnett_CV "
    "{params.human} "
    "&> {log}"

#NOTE non-conformant to the rest of the rules.
# Runs Scripts/run_Garnett_Pretrained.R with a classifier file taken from the
# config. Only a test-time output is declared (no training-time file).
rule Garnett_Pretrained: #TODO test this
  input:
    datafile = config["datafile"],
    labfile = config["labfile"],
    folds = "{output_dir}/CV_folds.RData",
    # Both optional config entries fall back to the placeholder path
    # "UNSPECIFIEDFILE" when they are not configured.
    genes_names = config.get("genes", "UNSPECIFIEDFILE"),
    classifier = config.get("Garnett_Pretrained", {}).get(
        "classifier", "UNSPECIFIEDFILE")
  output:
    pred = "{output_dir}/Garnett_Pretrained/Garnett_Pretrained_pred.csv",
    true = "{output_dir}/Garnett_Pretrained/Garnett_Pretrained_true.csv",
    test_time = "{output_dir}/Garnett_Pretrained/Garnett_Pretrained_test_time.csv"
  log: "{output_dir}/Garnett_Pretrained/Garnett_Pretrained.log"
  params:
    # Config boolean "human" (default True) rendered as the literal "T" or "F".
    human = "T" if config.get("human", True) else "F"
  singularity: "docker://scrnaseqbenchmark/garnett:{}".format(dockerTag)
  # Positional arguments: data, labels, gene names, folds, classifier,
  # output directory, human flag. Note that gene names come BEFORE the folds
  # here, which differs from the Garnett_CV rule.
  # NOTE(review): confirm this order against run_Garnett_Pretrained.R.
  shell:
    "Rscript Scripts/run_Garnett_Pretrained.R "
    "{input.datafile} "
    "{input.labfile} "
    "{input.genes_names} "
    "{input.folds} "
    "{input.classifier} "
    "{wildcards.output_dir}/Garnett_Pretrained "
    "{params.human} "
    "&> {log}"


"""
Rules for python based tools.
"""
# Runs Scripts/run_kNN50.py in the "baseline" container on the configured
# data/label files and the cross-validation folds.
# Declared outputs: predicted labels, true labels, test time and training time.
rule kNN50:
  input:
    datafile = config["datafile"],
    labfile = config["labfile"],
    folds = "{output_dir}/CV_folds.RData",
    ranking = feature_ranking
  output:
    pred = "{output_dir}/kNN50/kNN50_pred.csv",
    true = "{output_dir}/kNN50/kNN50_true.csv",
    test_time = "{output_dir}/kNN50/kNN50_test_time.csv",
    training_time = "{output_dir}/kNN50/kNN50_training_time.csv"
  log: "{output_dir}/kNN50/kNN50.log"
  params:
    # Falls back to 0 when "number_of_features" is absent from the config.
    n_features = config.get("number_of_features", 0)
  singularity: "docker://scrnaseqbenchmark/baseline:{}".format(dockerTag)
  # Positional arguments: data, labels, folds, output directory, ranking, n_features.
  # `&>` sends both stdout and stderr of the script to the log file.
  shell:
    "python3 Scripts/run_kNN50.py "
    "{input.datafile} "
    "{input.labfile} "
    "{input.folds} "
    "{wildcards.output_dir}/kNN50 "
    "{input.ranking} "
    "{params.n_features} "
    "&> {log}"

# Runs Scripts/run_kNN9.py in the "baseline" container; same inputs, argument
# order and output layout as the other python-based classifier rules.
rule kNN9:
  input:
    datafile = config["datafile"],
    labfile = config["labfile"],
    folds = "{output_dir}/CV_folds.RData",
    ranking = feature_ranking
  output:
    pred = "{output_dir}/kNN9/kNN9_pred.csv",
    true = "{output_dir}/kNN9/kNN9_true.csv",
    test_time = "{output_dir}/kNN9/kNN9_test_time.csv",
    training_time = "{output_dir}/kNN9/kNN9_training_time.csv"
  log: "{output_dir}/kNN9/kNN9.log"
  params:
    # Falls back to 0 when "number_of_features" is absent from the config.
    n_features = config.get("number_of_features", 0)
  singularity: "docker://scrnaseqbenchmark/baseline:{}".format(dockerTag)
  # Positional arguments: data, labels, folds, output directory, ranking, n_features.
  # stdout and stderr are both redirected to the log file.
  shell:
    "python3 Scripts/run_kNN9.py "
    "{input.datafile} "
    "{input.labfile} "
    "{input.folds} "
    "{wildcards.output_dir}/kNN9 "
    "{input.ranking} "
    "{params.n_features} "
    "&> {log}"

# Runs Scripts/run_Cell_BLAST.py inside its dedicated "cell_blast" container.
# Declared outputs: predicted labels, true labels, test time and training time.
rule Cell_BLAST:
  input:
    datafile = config["datafile"],
    labfile = config["labfile"],
    folds = "{output_dir}/CV_folds.RData",
    ranking = feature_ranking
  output:
    pred = "{output_dir}/Cell_BLAST/Cell_BLAST_pred.csv",
    true = "{output_dir}/Cell_BLAST/Cell_BLAST_true.csv",
    test_time = "{output_dir}/Cell_BLAST/Cell_BLAST_test_time.csv",
    training_time = "{output_dir}/Cell_BLAST/Cell_BLAST_training_time.csv"
  log: "{output_dir}/Cell_BLAST/Cell_BLAST.log"
  params:
    # Falls back to 0 when "number_of_features" is absent from the config.
    n_features = config.get("number_of_features", 0)
  singularity: "docker://scrnaseqbenchmark/cell_blast:{}".format(dockerTag)
  # Positional arguments: data, labels, folds, output directory, ranking, n_features.
  # stdout and stderr are both redirected to the log file.
  shell:
    "python3 Scripts/run_Cell_BLAST.py "
    "{input.datafile} "
    "{input.labfile} "
    "{input.folds} "
    "{wildcards.output_dir}/Cell_BLAST "
    "{input.ranking} "
    "{params.n_features} "
    "&> {log}"

# Runs Scripts/run_scVI.py inside its dedicated "scvi" container.
# Declared outputs: predicted labels, true labels, test time and training time.
rule scVI:
  input:
    datafile = config["datafile"],
    labfile = config["labfile"],
    folds = "{output_dir}/CV_folds.RData",
    ranking = feature_ranking
  output:
    pred = "{output_dir}/scVI/scVI_pred.csv",
    true = "{output_dir}/scVI/scVI_true.csv",
    test_time = "{output_dir}/scVI/scVI_test_time.csv",
    training_time = "{output_dir}/scVI/scVI_training_time.csv"
  log: "{output_dir}/scVI/scVI.log"
  params:
    # Falls back to 0 when "number_of_features" is absent from the config.
    n_features = config.get("number_of_features", 0)
  singularity: "docker://scrnaseqbenchmark/scvi:{}".format(dockerTag)
  # Positional arguments: data, labels, folds, output directory, ranking, n_features.
  # stdout and stderr are both redirected to the log file.
  shell:
    "python3 Scripts/run_scVI.py "
    "{input.datafile} "
    "{input.labfile} "
    "{input.folds} "
    "{wildcards.output_dir}/scVI "
    "{input.ranking} "
    "{params.n_features} "
    "&> {log}"

# Runs Scripts/run_LDA.py in the "baseline" container.
# Declared outputs: predicted labels, true labels, test time and training time.
rule LDA:
  input:
    datafile = config["datafile"],
    labfile = config["labfile"],
    folds = "{output_dir}/CV_folds.RData",
    ranking = feature_ranking
  output:
    pred = "{output_dir}/LDA/LDA_pred.csv",
    true = "{output_dir}/LDA/LDA_true.csv",
    test_time = "{output_dir}/LDA/LDA_test_time.csv",
    training_time = "{output_dir}/LDA/LDA_training_time.csv"
  log: "{output_dir}/LDA/LDA.log"
  params:
    # Falls back to 0 when "number_of_features" is absent from the config.
    n_features = config.get("number_of_features", 0)
  singularity: "docker://scrnaseqbenchmark/baseline:{}".format(dockerTag)
  # Positional arguments: data, labels, folds, output directory, ranking, n_features.
  # stdout and stderr are both redirected to the log file.
  shell:
    "python3 Scripts/run_LDA.py "
    "{input.datafile} "
    "{input.labfile} "
    "{input.folds} "
    "{wildcards.output_dir}/LDA "
    "{input.ranking} "
    "{params.n_features} "
    "&> {log}"

# Runs Scripts/run_LDA_rejection.py in the "baseline" container; results are
# written under a separate LDA_rejection/ directory so they do not collide
# with the plain LDA rule.
rule LDA_rejection:
  input:
    datafile = config["datafile"],
    labfile = config["labfile"],
    folds = "{output_dir}/CV_folds.RData",
    ranking = feature_ranking
  output:
    pred = "{output_dir}/LDA_rejection/LDA_rejection_pred.csv",
    true = "{output_dir}/LDA_rejection/LDA_rejection_true.csv",
    test_time = "{output_dir}/LDA_rejection/LDA_rejection_test_time.csv",
    training_time = "{output_dir}/LDA_rejection/LDA_rejection_training_time.csv"
  log: "{output_dir}/LDA_rejection/LDA_rejection.log"
  params:
    # Falls back to 0 when "number_of_features" is absent from the config.
    n_features = config.get("number_of_features", 0)
  singularity: "docker://scrnaseqbenchmark/baseline:{}".format(dockerTag)
  # Positional arguments: data, labels, folds, output directory, ranking, n_features.
  # stdout and stderr are both redirected to the log file.
  shell:
    "python3 Scripts/run_LDA_rejection.py "
    "{input.datafile} "
    "{input.labfile} "
    "{input.folds} "
    "{wildcards.output_dir}/LDA_rejection "
    "{input.ranking} "
    "{params.n_features} "
    "&> {log}"

# Runs Scripts/run_NMC.py in the "baseline" container.
# Declared outputs: predicted labels, true labels, test time and training time.
rule NMC:
  input:
    datafile = config["datafile"],
    labfile = config["labfile"],
    folds = "{output_dir}/CV_folds.RData",
    ranking = feature_ranking
  output:
    pred = "{output_dir}/NMC/NMC_pred.csv",
    true = "{output_dir}/NMC/NMC_true.csv",
    test_time = "{output_dir}/NMC/NMC_test_time.csv",
    training_time = "{output_dir}/NMC/NMC_training_time.csv"
  log: "{output_dir}/NMC/NMC.log"
  params:
    # Falls back to 0 when "number_of_features" is absent from the config.
    n_features = config.get("number_of_features", 0)
  singularity: "docker://scrnaseqbenchmark/baseline:{}".format(dockerTag)
  # Positional arguments: data, labels, folds, output directory, ranking, n_features.
  # stdout and stderr are both redirected to the log file.
  shell:
    "python3 Scripts/run_NMC.py "
    "{input.datafile} "
    "{input.labfile} "
    "{input.folds} "
    "{wildcards.output_dir}/NMC "
    "{input.ranking} "
    "{params.n_features} "
    "&> {log}"

# Runs Scripts/run_RF.py in the "baseline" container.
# Declared outputs: predicted labels, true labels, test time and training time.
rule RF:
  input:
    datafile = config["datafile"],
    labfile = config["labfile"],
    folds = "{output_dir}/CV_folds.RData",
    ranking = feature_ranking
  output:
    pred = "{output_dir}/RF/RF_pred.csv",
    true = "{output_dir}/RF/RF_true.csv",
    test_time = "{output_dir}/RF/RF_test_time.csv",
    training_time = "{output_dir}/RF/RF_training_time.csv"
  log: "{output_dir}/RF/RF.log"
  params:
    # Falls back to 0 when "number_of_features" is absent from the config.
    n_features = config.get("number_of_features", 0)
  singularity: "docker://scrnaseqbenchmark/baseline:{}".format(dockerTag)
  # Positional arguments: data, labels, folds, output directory, ranking, n_features.
  # stdout and stderr are both redirected to the log file.
  shell:
    "python3 Scripts/run_RF.py "
    "{input.datafile} "
    "{input.labfile} "
    "{input.folds} "
    "{wildcards.output_dir}/RF "
    "{input.ranking} "
    "{params.n_features} "
    "&> {log}"

# Runs Scripts/run_SVM.py in the "baseline" container.
# Declared outputs: predicted labels, true labels, test time and training time.
rule SVM:
  input:
    datafile = config["datafile"],
    labfile = config["labfile"],
    folds = "{output_dir}/CV_folds.RData",
    ranking = feature_ranking
  output:
    pred = "{output_dir}/SVM/SVM_pred.csv",
    true = "{output_dir}/SVM/SVM_true.csv",
    test_time = "{output_dir}/SVM/SVM_test_time.csv",
    training_time = "{output_dir}/SVM/SVM_training_time.csv"
  log: "{output_dir}/SVM/SVM.log"
  params:
    # Falls back to 0 when "number_of_features" is absent from the config.
    n_features = config.get("number_of_features", 0)
  singularity: "docker://scrnaseqbenchmark/baseline:{}".format(dockerTag)
  # Positional arguments: data, labels, folds, output directory, ranking, n_features.
  # stdout and stderr are both redirected to the log file.
  shell:
    "python3 Scripts/run_SVM.py "
    "{input.datafile} "
    "{input.labfile} "
    "{input.folds} "
    "{wildcards.output_dir}/SVM "
    "{input.ranking} "
    "{params.n_features} "
    "&> {log}"

# Runs Scripts/run_SVM_rejection.py in the "baseline" container; results are
# written under a separate SVM_rejection/ directory so they do not collide
# with the plain SVM rule.
rule SVM_rejection:
  input:
    datafile = config["datafile"],
    labfile = config["labfile"],
    folds = "{output_dir}/CV_folds.RData",
    ranking = feature_ranking
  output:
    pred = "{output_dir}/SVM_rejection/SVM_rejection_pred.csv",
    true = "{output_dir}/SVM_rejection/SVM_rejection_true.csv",
    test_time = "{output_dir}/SVM_rejection/SVM_rejection_test_time.csv",
    training_time = "{output_dir}/SVM_rejection/SVM_rejection_training_time.csv"
  log: "{output_dir}/SVM_rejection/SVM_rejection.log"
  params:
    # Falls back to 0 when "number_of_features" is absent from the config.
    n_features = config.get("number_of_features", 0)
  singularity: "docker://scrnaseqbenchmark/baseline:{}".format(dockerTag)
  # Positional arguments: data, labels, folds, output directory, ranking, n_features.
  # stdout and stderr are both redirected to the log file.
  shell:
    "python3 Scripts/run_SVM_rejection.py "
    "{input.datafile} "
    "{input.labfile} "
    "{input.folds} "
    "{wildcards.output_dir}/SVM_rejection "
    "{input.ranking} "
    "{params.n_features} "
    "&> {log}"


================================================
FILE: Snakemake/evaluate.R
================================================
# Positional command-line arguments, in order:
#   1. csv with the true labels
#   2. csv with the predicted labels
#   3. output directory
#   4. tool name (used as the file name of every result csv)
cli_args <- commandArgs(trailingOnly = TRUE)

TrueLabelsPath <- cli_args[1]
PredLabelsPath <- cli_args[2]
OutputDir      <- cli_args[3]
ToolName       <- cli_args[4]

evaluate <- function(TrueLabelsPath, PredLabelsPath, Indices = NULL){
  # Evaluate the performance of a classifier from its true and predicted labels.
  #
  # Parameters
  # ----------
  # TrueLabelsPath: csv file with the true labels (one column, no index)
  # PredLabelsPath: csv file with the predicted labels (one column, no index)
  # Indices: optional index vector selecting which entries of both label
  #          vectors are evaluated (e.g. when several datasets share one file)
  #
  # Returns
  # -------
  # A list with:
  #   Conf    : confusion matrix (true x predicted, including unlabeled columns)
  #   MedF1   : median of the per-class F1-scores
  #   F1      : F1-score per true class, named and ordered like PopSize
  #   Acc     : correctly labeled cells / cells with a non-rejected prediction
  #   PercUnl : fraction of cells whose prediction is one of the rejection labels
  #   PopSize : number of cells per true class (classes with zero cells dropped)
  #
  # Cells predicted as 'unassigned', 'Unassigned', 'Unknown', 'unknown', 'rand',
  # 'ambiguous' or 'Node' + two characters count as unlabeled and are left out
  # of the F1 and accuracy computation.

  unlabeled <- c('unassigned','Unassigned','Unknown','rand','Node','ambiguous','unknown')

  true_lab <- unlist(read.csv(TrueLabelsPath))
  pred_lab <- unlist(read.csv(PredLabelsPath))

  if (!is.null(Indices)) {
    true_lab <- true_lab[Indices]
    pred_lab <- pred_lab[Indices]
  }

  # The full confusion matrix is built before the 'Node..' labels are collapsed.
  conf <- table(true_lab, pred_lab)
  pop_size <- rowSums(conf)
  pop_size <- pop_size[pop_size > 0]

  # NOTE(review): the pattern needs exactly two characters after 'Node', so a
  # label such as 'Node1' is not collapsed - confirm this is intended.
  pred_lab <- gsub('Node..', 'Node', pred_lab)

  conf_F1 <- table(true_lab, pred_lab, exclude = unlabeled)
  row_totals <- rowSums(conf_F1)
  col_totals <- colSums(conf_F1)

  # Classes are looked up by name instead of by row position: conf_F1 can have
  # fewer rows than there are true classes (a true label that is itself in the
  # rejection list) or extra empty rows (unused factor levels), and positional
  # indexing would then fail or pair scores with the wrong class.
  classes <- names(pop_size)
  F1 <- numeric(length(classes))
  sum_acc <- 0

  for (k in seq_along(classes)) {
    cls <- classes[k]
    if (!(cls %in% rownames(conf_F1)) || !(cls %in% colnames(conf_F1))) {
      next  # class never predicted (or excluded): F1 stays 0
    }
    tp <- conf_F1[cls, cls]
    sum_acc <- sum_acc + tp
    # With no true positives F1 is 0; testing tp directly also avoids the
    # 0/0 = NaN precision/recall that would break a comparison inside if().
    if (tp > 0) {
      prec <- tp / col_totals[[cls]]
      rec <- tp / row_totals[[cls]]
      F1[k] <- (2 * prec * rec) / (prec + rec)
    }
  }
  names(F1) <- classes

  med_F1 <- median(F1)

  total <- length(pred_lab)
  # %in% never yields NA, so a missing prediction cannot turn the count into NA.
  num_unlab <- sum(pred_lab %in% unlabeled)
  per_unlab <- num_unlab / total

  acc <- sum_acc / sum(conf_F1)

  list(Conf = conf, MedF1 = med_F1, F1 = F1, Acc = acc, PercUnl = per_unlab, PopSize = pop_size)
}

# Evaluate the tool and store every measure as <OutputDir>/<measure>/<ToolName>.csv.
results <- evaluate(TrueLabelsPath, PredLabelsPath)
csv_name <- paste0(ToolName, ".csv")

write.csv(results$Conf, file.path(OutputDir, "Confusion", csv_name))
write.csv(results$F1, file.path(OutputDir, "F1", csv_name))
write.csv(results$PopSize, file.path(OutputDir, "PopSize", csv_name))

# The three scalar measures are combined into one single-row summary table.
df <- data.frame(results[c("MedF1", "Acc", "PercUnl")])
write.csv(df, file.path(OutputDir, "Summary", csv_name))


================================================
FILE: Snakemake/example.config.yml
================================================
# Example configuration for the Snakemake workflow.

# Directory that receives all results.
# NOTE(review): presumably this feeds the {output_dir} wildcard of the rules - confirm in the Snakefile.
output_dir: output
# Expression data (csv) and label file (csv) handed to every tool script.
datafile: input/data.csv
labfile: input/Labels.csv
# NOTE(review): presumably the column of the label file to use - confirm against Cross_Validation.R.
column: 1
# Passed to the tool scripts as their last positional argument; the rules fall back to 0 when unset.
number_of_features: 0
# NOTE(review): presumably the names of the rules/tools whose outputs are requested - confirm in the Snakefile.
tools_to_run:
  - Cell_BLAST
  - scVI
  - scmapcell 


================================================
FILE: Snakemake/rank_gene_dropouts.py
================================================
import os
from sys import argv
from pathlib import Path

import rpy2.robjects as robjects
import numpy as np
import pandas as pd
from sklearn import linear_model


def rank_gene_dropouts(DataPath, CV_RDataPath, OutputDir):
    '''
    Rank the genes of every cross-validation training set by their dropout level.

    For each fold the log2(x + 1) transformed training cells are used to compute,
    per gene, the dropout percentage (share of zero entries) and the mean
    expression. log2(dropout) is regressed on the mean with a linear model and
    genes are ordered by decreasing residual. Genes with a zero mean or zero
    dropout are left out of the fit and appended after the ranked genes.

    The ranking is written to "rank_genes_dropouts.csv" inside OutputDir, one
    column per fold. Genes are identified by their 0-based column position in
    the data file, not by name.

    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
    OutputDir : Output directory defining the path of the exported file.
    '''

    # Loading the RData file makes n_folds, Cells_to_Keep and Train_Idx
    # available in the embedded R session.
    robjects.r['load'](CV_RDataPath)

    nfolds = int(np.squeeze(np.array(robjects.r['n_folds'], dtype = 'int')))
    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
    # Train_Idx is an R list with one index vector per fold. It is kept as an
    # R object and converted fold by fold: wrapping the whole list in
    # np.array() yields a ragged array whenever folds differ in size, which
    # recent NumPy versions refuse to build.
    train_ind = robjects.r['Train_Idx']

    # read the data
    data = pd.read_csv(DataPath, index_col=0, sep=',')
    data = data.iloc[tokeep]
    data = np.log2(data + 1)

    # One column of gene identifiers per fold (stored as strings).
    genes = np.zeros([np.shape(data)[1], nfolds], dtype = '>U10')

    for i in range(nfolds):
        # R indices are 1-based, pandas positions are 0-based.
        train_ind_i = np.array(train_ind[i], dtype = 'int') - 1
        train = data.iloc[train_ind_i]
        train.columns = np.arange(len(train.columns))

        # dropout percentage and mean expression per gene in the training set
        dropout = (train == 0).sum(axis='rows')
        dropout = (dropout / train.shape[0]) * 100
        mean = train.mean(axis='rows')

        # Only genes with a positive mean and a positive dropout enter the fit
        # (log2 of a zero dropout is undefined).
        usable = (np.array(mean) > 0) & (np.array(dropout) > 0)
        notzero = np.where(usable)[0]
        zero = np.where(~usable)[0]
        train_notzero = train.iloc[:, notzero]
        zero_genes = train.iloc[:, zero].columns

        dropout = dropout.iloc[notzero]
        mean = mean.iloc[notzero]

        dropout = np.log2(np.array(dropout)).reshape(-1, 1)
        mean = np.array(mean).reshape(-1, 1)
        reg = linear_model.LinearRegression()
        reg.fit(mean, dropout)

        # Largest positive residual (more dropout than the mean predicts) first.
        residuals = dropout - reg.predict(mean)
        residuals = pd.Series(np.array(residuals).ravel(), index=train_notzero.columns)
        residuals = residuals.sort_values(ascending=False)
        sorted_genes = residuals.index.append(zero_genes)

        genes[:, i] = sorted_genes.values

    genes = pd.DataFrame(genes)

    genes.to_csv(str(Path(OutputDir) / "rank_genes_dropouts.csv"), index = False)

# Script entry point; positional arguments: <data csv> <CV folds RData> <output directory>.
rank_gene_dropouts(argv[1], argv[2], argv[3])


================================================
FILE: evaluate.R
================================================
evaluate <- function(TrueLabelsPath, PredLabelsPath, Indices = NULL){
  # Evaluate the performance of a classifier from its true and predicted labels.
  #
  # Parameters
  # ----------
  # TrueLabelsPath: csv file with the true labels (one column, no index)
  # PredLabelsPath: csv file with the predicted labels (one column, no index)
  # Indices: optional index vector selecting which entries of both label
  #          vectors are evaluated (e.g. when several datasets share one file)
  #
  # Returns
  # -------
  # A list with:
  #   Conf    : confusion matrix (true x predicted, including unlabeled columns)
  #   MedF1   : median of the per-class F1-scores
  #   F1      : F1-score per true class, named and ordered like PopSize
  #   Acc     : correctly labeled cells / cells with a non-rejected prediction
  #   PercUnl : fraction of cells whose prediction is one of the rejection labels
  #   PopSize : number of cells per true class (classes with zero cells dropped)
  #
  # Cells predicted as 'unassigned', 'Unassigned', 'Unknown', 'unknown', 'rand',
  # 'ambiguous' or 'Node' + two characters count as unlabeled and are left out
  # of the F1 and accuracy computation.

  unlabeled <- c('unassigned','Unassigned','Unknown','rand','Node','ambiguous','unknown')

  true_lab <- unlist(read.csv(TrueLabelsPath))
  pred_lab <- unlist(read.csv(PredLabelsPath))

  if (!is.null(Indices)) {
    true_lab <- true_lab[Indices]
    pred_lab <- pred_lab[Indices]
  }

  # The full confusion matrix is built before the 'Node..' labels are collapsed.
  conf <- table(true_lab, pred_lab)
  pop_size <- rowSums(conf)
  pop_size <- pop_size[pop_size > 0]

  # NOTE(review): the pattern needs exactly two characters after 'Node', so a
  # label such as 'Node1' is not collapsed - confirm this is intended.
  pred_lab <- gsub('Node..', 'Node', pred_lab)

  conf_F1 <- table(true_lab, pred_lab, exclude = unlabeled)
  row_totals <- rowSums(conf_F1)
  col_totals <- colSums(conf_F1)

  # Classes are looked up by name instead of by row position: conf_F1 can have
  # fewer rows than there are true classes (a true label that is itself in the
  # rejection list) or extra empty rows (unused factor levels), and positional
  # indexing would then fail or pair scores with the wrong class.
  classes <- names(pop_size)
  F1 <- numeric(length(classes))
  sum_acc <- 0

  for (k in seq_along(classes)) {
    cls <- classes[k]
    if (!(cls %in% rownames(conf_F1)) || !(cls %in% colnames(conf_F1))) {
      next  # class never predicted (or excluded): F1 stays 0
    }
    tp <- conf_F1[cls, cls]
    sum_acc <- sum_acc + tp
    # With no true positives F1 is 0; testing tp directly also avoids the
    # 0/0 = NaN precision/recall that would break a comparison inside if().
    if (tp > 0) {
      prec <- tp / col_totals[[cls]]
      rec <- tp / row_totals[[cls]]
      F1[k] <- (2 * prec * rec) / (prec + rec)
    }
  }
  names(F1) <- classes

  med_F1 <- median(F1)

  total <- length(pred_lab)
  # %in% never yields NA, so a missing prediction cannot turn the count into NA.
  num_unlab <- sum(pred_lab %in% unlabeled)
  per_unlab <- num_unlab / total

  acc <- sum_acc / sum(conf_F1)

  list(Conf = conf, MedF1 = med_F1, F1 = F1, Acc = acc, PercUnl = per_unlab, PopSize = pop_size)
}


================================================
FILE: rank_gene_dropouts.py
================================================
import os
import rpy2.robjects as robjects
import numpy as np
import pandas as pd
from sklearn import linear_model


def rank_gene_dropouts(DataPath, CV_RDataPath, OutputDir):
    '''
    Rank the genes of every cross-validation training set by their dropout level.

    For each fold the log2(x + 1) transformed training cells are used to compute,
    per gene, the dropout percentage (share of zero entries) and the mean
    expression. log2(dropout) is regressed on the mean with a linear model and
    genes are ordered by decreasing residual. Genes with a zero mean or zero
    dropout are left out of the fit and appended after the ranked genes.

    The ranking is written to "rank_genes_dropouts.csv" inside OutputDir, one
    column per fold. Genes are identified by their 0-based column position in
    the data file, not by name.

    Side effect: the process working directory is changed to OutputDir.

    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
    OutputDir : Output directory defining the path of the exported file.
    '''

    # Loading the RData file makes n_folds, Cells_to_Keep and Train_Idx
    # available in the embedded R session.
    robjects.r['load'](CV_RDataPath)

    nfolds = int(np.squeeze(np.array(robjects.r['n_folds'], dtype = 'int')))
    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
    # Train_Idx is an R list with one index vector per fold. It is kept as an
    # R object and converted fold by fold: wrapping the whole list in
    # np.array() yields a ragged array whenever folds differ in size, which
    # recent NumPy versions refuse to build.
    train_ind = robjects.r['Train_Idx']

    # read the data
    data = pd.read_csv(DataPath, index_col=0, sep=',')
    data = data.iloc[tokeep]
    data = np.log2(data + 1)

    # One column of gene identifiers per fold (stored as strings).
    genes = np.zeros([np.shape(data)[1], nfolds], dtype = '>U10')

    for i in range(nfolds):
        # R indices are 1-based, pandas positions are 0-based.
        train_ind_i = np.array(train_ind[i], dtype = 'int') - 1
        train = data.iloc[train_ind_i]
        train.columns = np.arange(len(train.columns))

        # dropout percentage and mean expression per gene in the training set
        dropout = (train == 0).sum(axis='rows')
        dropout = (dropout / train.shape[0]) * 100
        mean = train.mean(axis='rows')

        # Only genes with a positive mean and a positive dropout enter the fit
        # (log2 of a zero dropout is undefined).
        usable = (np.array(mean) > 0) & (np.array(dropout) > 0)
        notzero = np.where(usable)[0]
        zero = np.where(~usable)[0]
        train_notzero = train.iloc[:, notzero]
        zero_genes = train.iloc[:, zero].columns

        dropout = dropout.iloc[notzero]
        mean = mean.iloc[notzero]

        dropout = np.log2(np.array(dropout)).reshape(-1, 1)
        mean = np.array(mean).reshape(-1, 1)
        reg = linear_model.LinearRegression()
        reg.fit(mean, dropout)

        # Largest positive residual (more dropout than the mean predicts) first.
        residuals = dropout - reg.predict(mean)
        residuals = pd.Series(np.array(residuals).ravel(), index=train_notzero.columns)
        residuals = residuals.sort_values(ascending=False)
        sorted_genes = residuals.index.append(zero_genes)

        genes[:, i] = sorted_genes.values

    genes = pd.DataFrame(genes)

    # NOTE(review): changing the working directory is a process-wide side
    # effect; it is kept because existing callers may rely on it.
    os.chdir(OutputDir)
    genes.to_csv("rank_genes_dropouts.csv", index = False)