Repository: tabdelaal/scRNAseq_Benchmark
Branch: master
Commit: 553869b632f4
Files: 82
Total size: 288.4 KB
Directory structure:
gitextract_ikyozzhh/
├── Cross_Validation.R
├── DEgenesMAST.R
├── LICENSE
├── README.md
├── Scripts/
│ ├── run_ACTINN.py
│ ├── run_CHETAH.R
│ ├── run_CaSTLe.R
│ ├── run_Cell_BLAST.py
│ ├── run_DigitalCellSorter.py
│ ├── run_Garnett_CV.R
│ ├── run_Garnett_Pretrained.R
│ ├── run_LAmbDA.py
│ ├── run_LDA.py
│ ├── run_LDA_rejection.py
│ ├── run_NMC.py
│ ├── run_RF.py
│ ├── run_SCINA.R
│ ├── run_SVM.py
│ ├── run_SVM_rejection.py
│ ├── run_SingleR.R
│ ├── run_kNN50.py
│ ├── run_kNN9.py
│ ├── run_moana.py
│ ├── run_scID.R
│ ├── run_scPred.R
│ ├── run_scVI.py
│ ├── run_scmap.R
│ └── run_singleCellNet.R
├── Snakemake/
│ ├── Cross_Validation.R
│ ├── DEgenesMAST.R
│ ├── Dockerfiles/
│ │ ├── baseline/
│ │ │ └── Dockerfile
│ │ ├── cell_blast/
│ │ │ └── Dockerfile
│ │ ├── chetah/
│ │ │ ├── Dockerfile
│ │ │ └── install_packages.R
│ │ ├── cross_validation/
│ │ │ ├── Dockerfile
│ │ │ └── install_packages.R
│ │ ├── garnett/
│ │ │ ├── Dockerfile
│ │ │ └── install_packages.R
│ │ ├── scid/
│ │ │ ├── Dockerfile
│ │ │ └── install_packages.R
│ │ ├── scmap/
│ │ │ ├── Dockerfile
│ │ │ └── install_packages.R
│ │ ├── scvi/
│ │ │ └── Dockerfile
│ │ ├── singlecellnet/
│ │ │ ├── Dockerfile
│ │ │ └── install_packages.R
│ │ └── singler/
│ │ ├── Dockerfile
│ │ └── install_packages.R
│ ├── LICENSE
│ ├── README.md
│ ├── Scripts/
│ │ ├── run_ACTINN.py
│ │ ├── run_CHETAH.R
│ │ ├── run_CaSTLe.R
│ │ ├── run_Cell_BLAST.py
│ │ ├── run_DigitalCellSorter.py
│ │ ├── run_Garnett_CV.R
│ │ ├── run_Garnett_Pretrained.R
│ │ ├── run_LAmbDA.py
│ │ ├── run_LDA.py
│ │ ├── run_LDA_rejection.py
│ │ ├── run_NMC.py
│ │ ├── run_RF.py
│ │ ├── run_SCINA.R
│ │ ├── run_SVM.py
│ │ ├── run_SVM_rejection.py
│ │ ├── run_SingleR.R
│ │ ├── run_kNN50.py
│ │ ├── run_kNN9.py
│ │ ├── run_moana.py
│ │ ├── run_scID.R
│ │ ├── run_scPred.R
│ │ ├── run_scVI.py
│ │ ├── run_scmap.R
│ │ ├── run_scmapcell.R
│ │ ├── run_scmapcluster.R
│ │ ├── run_scmaptotal.R
│ │ └── run_singleCellNet.R
│ ├── Snakefile
│ ├── evaluate.R
│ ├── example.config.yml
│ └── rank_gene_dropouts.py
├── evaluate.R
└── rank_gene_dropouts.py
================================================
FILE CONTENTS
================================================
================================================
FILE: Cross_Validation.R
================================================
Cross_Validation <- function(LabelsPath, col_Index = 1, OutputDir){
  "
  Cross_Validation
  Returns train and test indices for 5 folds stratified across unique cell populations,
  after filtering out cell populations with fewer than 10 cells.
  It writes a 'CV_folds.RData' file which is then used as input to the classifier wrappers.
  Parameters
  ----------
  LabelsPath : Cell population annotations file path (.csv).
  col_Index : column index (integer) defining which level of annotation to use,
  in case of multiple cell type annotations (default is 1)
  OutputDir : Output directory defining the path of the exported file.
  "
  Labels <- as.matrix(read.csv(LabelsPath))
  Labels <- as.vector(Labels[,col_Index])

  # Drop populations with fewer than 10 cells, as documented above.
  # BUGFIX: the original used !(table(Labels) > 10), which also removed
  # populations with exactly 10 cells, contradicting the documentation.
  Removed_classes <- table(Labels) < 10
  Cells_to_Keep <- !(is.element(Labels, names(Removed_classes)[Removed_classes]))
  Labels <- Labels[Cells_to_Keep]

  # Stratified 5-fold split over the retained cells.
  library(rBayesianOptimization)
  n_folds <- 5
  Folds <- KFold(Labels, nfolds = n_folds, stratified = TRUE)
  Test_Folds <- c(n_folds:1)
  Train_Idx <- list()
  Test_Idx <- list()
  for (i in seq_along(Folds)){
    Temp_Folds <- Folds
    Temp_Folds[Test_Folds[i]] <- NULL          # drop the held-out fold
    Train_Idx[i] <- list(unlist(Temp_Folds))   # remaining folds form the training set
    Test_Idx[i] <- Folds[Test_Folds[i]]
  }
  remove(Temp_Folds, i, Folds)

  # Write next to OutputDir without mutating the global working directory
  # (the original called setwd(), a global side effect).
  save(n_folds, Train_Idx, Test_Idx, col_Index, Cells_to_Keep,
       file = file.path(OutputDir, 'CV_folds.RData'))
}
================================================
FILE: DEgenesMAST.R
================================================
DEgenesMAST <- function(Data, Labels, Normalize = FALSE, LogTransform = FALSE){
  # Applies a one-vs-all differential expression test (MAST, via Seurat) to the data.
  # The training data should be used as the input.
  # Output: a matrix of marker genes where columns are cell populations and rows are
  # the (up to) top-20 upregulated marker genes per population. This output can be
  # rewritten to the format of the prior-knowledge-supervised classifiers and
  # afterwards be used to classify the test set.
  #
  # Data: genes X cells (rows = genes, columns = cells)
  # Labels: labels of the data (one per cell/column)
  # Normalize: MAST expects CPM-normalized data; set to TRUE if the data is
  #   not normalized yet.
  # LogTransform: MAST expects log-transformed data; set to TRUE if the data is
  #   not log-transformed yet.
  library(Seurat)
  if (Normalize){
    # Counts-per-million scaling, per cell (column).
    Data <- apply(Data, 2, function(x) (x / sum(x)) * 1000000)
  }
  if (LogTransform){
    Data <- log(Data + 1, base = 2)
  }
  # NOTE(review): this uses the Seurat v2 API (raw.data =, SetIdent(ident.use =),
  # avg_logFC column name); it will not run unmodified under Seurat >= 3.
  SeuObj <- CreateSeuratObject(raw.data = Data, project = "DEgenes")
  SeuObj <- SetIdent(SeuObj, ident.use = Labels)
  DEgenes <- FindAllMarkers(SeuObj, test.use = "MAST")
  Markers <- matrix(nrow = 20, ncol = length(unique(Labels)))
  colnames(Markers) <- unique(Labels)
  for (i in unique(Labels)){
    # Upregulated genes for population i, in FindAllMarkers' output order.
    # (The original also contained a stray no-op `i` expression and a leftover
    # debug print of adjusted p-values; both removed.)
    TempList <- DEgenes$gene[(DEgenes$cluster == i) & (DEgenes$avg_logFC > 0)]
    if (length(TempList) >= 20){
      Markers[, i] <- TempList[1:20]
    } else if (length(TempList) > 0){
      # Fewer than 20 markers available: fill what we have, leave the rest NA.
      Markers[c(1:length(TempList)), i] <- TempList
    }
  }
  return(Markers)
}
================================================
FILE: LICENSE
================================================
MIT License
Copyright (c) 2019 tabdelaal
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
================================================
FILE: README.md
================================================
# A comparison of automatic cell identification methods for single-cell RNA-sequencing data
We present a comprehensive evaluation of the performance of state-of-the-art classification methods, in addition to general-purpose classifiers, for automatic cell identification in single-cell RNA-sequencing datasets. Our goal is to provide the community with a fair evaluation of all available methods to facilitate the users’ choice as well as direct further developments to focus on the challenging aspects of automated cell type identification. (published in Genome Biology, Sep. 2019: https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1795-z)
### Repository description
We provide all the scripts to run and evaluate all classifiers, and to reproduce the results introduced in the paper.
1. 'Scripts' folder contains a wrapper function to read the data and apply certain classification method.
2. ```Cross_Validation``` R script can be used to produce training and test indices for cross validation.
3. ```rank_gene_dropouts``` Python script can be used to apply feature selection using the dropout method, and rank genes accordingly.
4. ```evaluate``` R script can be used to evaluate the prediction of a certain classifier and obtain scores such as accuracy, median F1-score and % unlabeled cells.
For more details, please check function documentations.
### General Usage
To benchmark and fairly evaluate the performance of different classifiers using benchmark-datasets (Filtered datasets can be downloaded from https://zenodo.org/record/3357167), apply the following steps:
#### Step 1
Apply the ```Cross_Validation``` R function on the corresponding dataset to obtain fixed training and test cell indices, stratified across different cell types. For example, using the Tabula Muris (TM) dataset
```R
Cross_Validation('~/TM/Labels.csv', 1, '~/TM/')
```
This command will create a ```CV_folds.RData``` file used as input in Step 2.
#### Step 2
Run each classifier wrapper. For example, running scPred on TM dataset
```R
run_scPred('~/TM/Filtered_TM_data.csv','~/TM/Labels.csv','~/TM/CV_folds.RData','~/Results/TM/')
```
This command will output the true and predicted cell labels as csv files, as well as the classifier computation time.
#### Step 3
Evaluate the classifier prediction by
```R
result <- evaluate('~/Results/TM/scPred_True_Labels.csv', '~/Results/TM/scPred_Pred_Labels.csv')
```
This command will return the corresponding accuracy, median F1-score, F1-scores for all cell populations, % unlabeled cells, and confusion matrix.
### Usage with feature selection
#### Step 1
Apply the ```Cross_Validation``` R function on the corresponding dataset to obtain fixed training and test cell indices, stratified across different cell types. For example, using the Tabula Muris (TM) dataset
```R
Cross_Validation('~/TM/Labels.csv', 1, '~/TM/')
```
This command will create a ```CV_folds.RData``` file used as input in Step 2 and 3.
#### Step 2
Apply the ```rank_gene_dropouts``` Python script to get the genes ranking for each training fold using the dropout criteria
```
rank_gene_dropouts('~/TM/Filtered_TM_data.csv', '~/TM/CV_folds.RData', '~/TM/')
```
This command will create a ```rank_genes_dropouts.csv``` file used as input in Step 3.
#### Step 3
Run each classifier wrapper. For example, running scPred on TM dataset with 1000 genes
```R
run_scPred('~/TM/Filtered_TM_data.csv','~/TM/Labels.csv','~/TM/CV_folds.RData','~/Results/TM/',
GeneOrderPath = '~/TM/rank_genes_dropouts.csv',NumGenes = 1000)
```
This command will output the true and predicted cell labels as csv files, as well as the classifier computation time.
#### Step 4
Evaluate the classifier prediction by
```R
result <- evaluate('~/Results/TM/scPred_True_Labels.csv', '~/Results/TM/scPred_Pred_Labels.csv')
```
This command will return the corresponding accuracy, median F1-score, F1-scores for all cell populations, % unlabeled cells, and confusion matrix.
### Evaluate Marker-based methods using DE genes
To evaluate the marker-based methods SCINA, DigitalCellSorter and Garnett using DE genes learned from the data, you may follow these steps:
#### Step 1
Apply the ```Cross_Validation``` R function on the corresponding dataset to obtain fixed training and test cell indices, stratified across different cell types. For example, using the Zheng_sorted dataset
```R
Cross_Validation('~/Zheng_sorted/Labels.csv', 1, '~/Zheng_sorted/')
```
This command will create a ```CV_folds.RData``` file used as input in Step 2 and 3.
#### Step 2
For each fold use the training data to get the DE genes using the ```DEgenesMAST``` R function, and pass these DE genes to the corresponding method, for example here we use SCINA, to obtain cell prediction for the test data.
```R
load('CV_folds.RData')
Data <- read.csv('~/Zheng_sorted/Filtered_DownSampled_SortedPBMC_data.csv',row.names = 1)
Labels <- as.matrix(read.csv('~/Zheng_sorted/Labels.csv'))
Labels <- as.vector(Labels[,col_Index])
Data <- Data[Cells_to_Keep,]
Labels <- Labels[Cells_to_Keep]
for (i in c(1:n_folds))
{
MarkerGenes <- DEgenesMAST(t(Data[Train_Idx[[i]],]), Labels[Train_Idx[[i]]], Normalize = TRUE, LogTransform = TRUE)
## write the MarkerGenes into a marker genes file format, depending on the tested method, for example for SCINA
write.csv(MarkerGenes, 'MarkerGenes.csv')
## run the SCINA wrapper using these DE marker genes
run_SCINA(Data[Test_Idx[[i]],], Labels[Test_Idx[[i]]], 'MarkerGenes.csv', '~/Results/Zheng_sorted/')
}
```
### Snakemake
To support future extension of this benchmarking work with new classifiers and datasets, we provide a Snakemake workflow to automate the performed benchmarking analyses (https://github.com/tabdelaal/scRNAseq_Benchmark/tree/snakemake_and_docker).
================================================
FILE: Scripts/run_ACTINN.py
================================================
import os
import numpy as np
import pandas as pd
import time as tm
import rpy2.robjects as robjects
def run_ACTINN(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
    '''
    run ACTINN
    Wrapper script to run ACTINN on a benchmark dataset with 5-fold cross validation,
    outputs lists of true and predicted cell labels as csv files, as well as computation time.
    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
        as row names and gene names as column names.
    LabelsPath : Cell population annotations file path (.csv).
    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
    OutputDir : Output directory defining the path of the exported file.
    GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
        defining the genes order for each cross validation fold, default is NULL.
    NumGenes : Number of genes used in case of feature selection (integer), default is 0.
    '''
    # Load the fold definitions produced by Cross_Validation.R into the R session,
    # then pull the variables across via rpy2.
    robjects.r['load'](CV_RDataPath)
    nfolds = np.array(robjects.r['n_folds'], dtype = 'int')
    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
    col = np.array(robjects.r['col_Index'], dtype = 'int')
    # R column indices are 1-based; pandas' usecols is 0-based.
    col = col - 1
    test_ind = np.array(robjects.r['Test_Idx'])
    train_ind = np.array(robjects.r['Train_Idx'])
    # read the data; restrict both data and labels to the cells retained by
    # Cross_Validation.R (populations with enough cells).
    data = pd.read_csv(DataPath,index_col=0,sep=',')
    labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col)
    labels = labels.iloc[tokeep]
    data = data.iloc[tokeep]
    # read the feature (per-fold gene ranking) file, only used when NumGenes > 0
    if (NumGenes > 0):
        features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')
    # All intermediate and result files are written into OutputDir.
    os.chdir(OutputDir)
    tot=[]
    truelab = []
    pred = []
    for i in range(np.squeeze(nfolds)):
        # Fold indices are 1-based (from R); convert to 0-based.
        test_ind_i = np.array(test_ind[i], dtype = 'int') - 1
        train_ind_i = np.array(train_ind[i], dtype = 'int') - 1
        train=data.iloc[train_ind_i]
        test=data.iloc[test_ind_i]
        y_train=labels.iloc[train_ind_i]
        y_test=labels.iloc[test_ind_i]
        if (NumGenes > 0):
            # Keep only the top-NumGenes features ranked for this fold.
            feat_to_use = features.iloc[0:NumGenes,i]
            train = train.iloc[:,feat_to_use]
            test = test.iloc[:,feat_to_use]
        # ACTINN expects genes x cells, so transpose before writing.
        train = train.transpose()
        test = test.transpose()
        train.to_csv("train.csv")
        test.to_csv("test.csv")
        y_train.to_csv("train_lab.csv", header = False, index = True, sep = '\t')
        y_test.to_csv("test_lab.csv", header = False, index = True, sep = '\t')
        # NOTE(review): the sleep is presumably to let the CSVs flush/sync on a
        # shared filesystem before the external scripts read them — TODO confirm.
        tm.sleep(60)
        # NOTE(review): hard-coded, machine-specific ACTINN install paths; these
        # must be adapted to the local environment for the wrapper to run.
        os.system("python /home/nfs/lcmmichielsen/classifiers/ACTINN/actinn_format.py -i train.csv -o train -f csv")
        os.system("python /home/nfs/lcmmichielsen/classifiers/ACTINN/actinn_format.py -i test.csv -o test -f csv")
        # Time only the prediction step.
        start = tm.time()
        os.system("python /home/nfs/lcmmichielsen/classifiers/ACTINN/actinn_predict.py -trs train.h5 -trl train_lab.csv -ts test.h5")
        tot.append(tm.time()-start)
        tm.sleep(60)
        truelab.extend(y_test.values)
        # actinn_predict.py writes its predictions to predicted_label.txt.
        predlabels = pd.read_csv('predicted_label.txt',header=0,index_col=None, sep='\t', usecols = [1])
        pred.extend(predlabels.values)
    truelab = pd.DataFrame(truelab)
    pred = pd.DataFrame(pred)
    tot_time = pd.DataFrame(tot)
    # File names carry the gene count when feature selection was used.
    if (NumGenes == 0):
        truelab.to_csv("ACTINN_True_Labels.csv", index = False)
        pred.to_csv("ACTINN_Pred_Labels.csv", index = False)
        tot_time.to_csv("ACTINN_Total_Time.csv", index = False)
    else:
        truelab.to_csv("ACTINN_" + str(NumGenes) + "_True_Labels.csv", index = False)
        pred.to_csv("ACTINN_" + str(NumGenes) + "_Pred_Labels.csv", index = False)
        tot_time.to_csv("ACTINN_" + str(NumGenes) + "_Total_Time.csv", index = False)
================================================
FILE: Scripts/run_CHETAH.R
================================================
run_CHETAH<-function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){
  "
  run CHETAH
  Wrapper script to run CHETAH on a benchmark dataset with 5-fold cross validation,
  outputs lists of true and predicted cell labels as csv files, as well as computation time.
  Parameters
  ----------
  DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
  as row names and gene names as column names.
  LabelsPath : Cell population annotations file path (.csv).
  CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
  OutputDir : Output directory defining the path of the exported file.
  GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
  defining the genes order for each cross validation fold, default is NULL.
  NumGenes : Number of genes used in case of feature selection (integer), default is NULL.
  "
  # Read cells x genes matrix and annotations.
  Data <- read.csv(DataPath,row.names = 1)
  Labels <- as.matrix(read.csv(LabelsPath))
  # Brings n_folds, Train_Idx, Test_Idx, col_Index, Cells_to_Keep into scope.
  load(CV_RDataPath)
  Labels <- as.vector(Labels[,col_Index])
  # Keep only cells retained by Cross_Validation.R.
  Data <- Data[Cells_to_Keep,]
  Labels <- Labels[Cells_to_Keep]
  # Optional feature selection: per-fold gene ranking produced upstream.
  if(!is.null(GeneOrderPath) & !is.null (NumGenes)){
    GenesOrder = read.csv(GeneOrderPath)
  }
  #############################################################################
  #                               CHETAH                                      #
  #############################################################################
  library(CHETAH)
  library(SingleCellExperiment)
  True_Labels_CHETAH <- list()
  Pred_Labels_CHETAH <- list()
  Total_Time_CHETAH <- list()
  # CHETAH expects genes x cells, hence the transpose.
  Data = t(as.matrix(Data))
  for (i in c(1:n_folds)){
    if(!is.null(GeneOrderPath) & !is.null (NumGenes)){
      # "+1" converts the 0-based gene ranks (presumably from the Python
      # rank_gene_dropouts script — confirm) to 1-based R row indices.
      sce <- SingleCellExperiment(assays = list(counts = Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Train_Idx[[i]]]),
                                  colData = data.frame(celltypes = Labels[Train_Idx[[i]]]))
      sce_test <- SingleCellExperiment(assays = list(counts = Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Test_Idx[[i]]]),
                                       colData = data.frame(celltypes = Labels[Test_Idx[[i]]]))
      # Time only the classification call.
      start_time <- Sys.time()
      sce_test <- CHETAHclassifier(input = sce_test, ref_cells = sce, n_genes = NumGenes)
      end_time <- Sys.time()
    }
    else{
      # No feature selection: use all genes.
      sce <- SingleCellExperiment(assays = list(counts = Data[,Train_Idx[[i]]]),
                                  colData = data.frame(celltypes = Labels[Train_Idx[[i]]]))
      sce_test <- SingleCellExperiment(assays = list(counts = Data[,Test_Idx[[i]]]),
                                       colData = data.frame(celltypes = Labels[Test_Idx[[i]]]))
      start_time <- Sys.time()
      sce_test <- CHETAHclassifier(input = sce_test, ref_cells = sce)
      end_time <- Sys.time()
    }
    Total_Time_CHETAH[i] <- as.numeric(difftime(end_time,start_time,units = 'secs'))
    True_Labels_CHETAH[i] <- list(Labels[Test_Idx[[i]]])
    # CHETAHclassifier stores its predictions in colData column celltype_CHETAH.
    Pred_Labels_CHETAH[i] <- list(sce_test$celltype_CHETAH)
  }
  # Flatten the per-fold lists into single vectors for export.
  True_Labels_CHETAH <- as.vector(unlist(True_Labels_CHETAH))
  Pred_Labels_CHETAH <- as.vector(unlist(Pred_Labels_CHETAH))
  Total_Time_CHETAH <- as.vector(unlist(Total_Time_CHETAH))
  setwd(OutputDir)
  # File names carry the gene count when feature selection was used.
  if (!is.null(GeneOrderPath) & !is.null (NumGenes)){
    write.csv(True_Labels_CHETAH,paste('CHETAH_',NumGenes,'_True_Labels.csv', sep = ''),row.names = FALSE)
    write.csv(Pred_Labels_CHETAH,paste('CHETAH_',NumGenes,'_Pred_Labels.csv', sep = ''),row.names = FALSE)
    write.csv(Total_Time_CHETAH,paste('CHETAH_',NumGenes,'_Total_Time.csv', sep = ''),row.names = FALSE)
  }
  else{
    write.csv(True_Labels_CHETAH,'CHETAH_True_Labels.csv',row.names = FALSE)
    write.csv(Pred_Labels_CHETAH,'CHETAH_Pred_Labels.csv',row.names = FALSE)
    write.csv(Total_Time_CHETAH,'CHETAH_Total_Time.csv',row.names = FALSE)
  }
}
================================================
FILE: Scripts/run_CaSTLe.R
================================================
run_CaSTLe<-function(DataPath,LabelsPath,CV_RDataPath, OutputDir, GeneOrderPath = NULL, NumGenes = NULL){
  "
  run CaSTLe
  Wrapper script to run CaSTLe on a benchmark dataset with 5-fold cross validation,
  outputs lists of true and predicted cell labels as csv files, as well as computation time.
  Parameters
  ----------
  DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
  as row names and gene names as column names.
  LabelsPath : Cell population annotations file path (.csv).
  CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
  OutputDir : Output directory defining the path of the exported file.
  GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
  defining the genes order for each cross validation fold, default is NULL.
  NumGenes : Number of genes used in case of feature selection (integer), default is NULL.
  "
  # Read cells x genes matrix and annotations.
  Data <- read.csv(DataPath,row.names = 1)
  Labels <- as.matrix(read.csv(LabelsPath))
  # Brings n_folds, Train_Idx, Test_Idx, col_Index, Cells_to_Keep into scope.
  load(CV_RDataPath)
  Labels <- as.vector(Labels[,col_Index])
  # Keep only cells retained by Cross_Validation.R.
  Data <- Data[Cells_to_Keep,]
  Labels <- Labels[Cells_to_Keep]
  # Optional feature selection: per-fold gene ranking produced upstream.
  if(!is.null(GeneOrderPath) & !is.null (NumGenes)){
    GenesOrder = read.csv(GeneOrderPath)
  }
  #############################################################################
  #                                CaSTLe                                     #
  #############################################################################
  library(igraph)
  library(xgboost)
  True_Labels_Castle <- list()
  Pred_Labels_Castle <- list()
  Training_Time_Castle <- list()
  Testing_Time_Castle <- list()
  # Expression bin edges used for discretization, and the number of top
  # features considered by CaSTLe.
  BREAKS=c(-1, 0, 1, 6, Inf)
  nFeatures = 100
  for(i in c(1:n_folds)){
    # 1. Load datasets (train = source, test = target). "+1" converts the
    # 0-based gene ranks to 1-based R column indices.
    if(!is.null(GeneOrderPath) & !is.null (NumGenes)){
      ds1 = Data[Train_Idx[[i]],as.vector(GenesOrder[c(1:NumGenes),i])+1]
      ds2 = Data[Test_Idx[[i]],as.vector(GenesOrder[c(1:NumGenes),i])+1]
    }
    else{
      ds1 = Data[Train_Idx[[i]],]
      ds2 = Data[Test_Idx[[i]],]
    }
    sourceCellTypes = as.factor(Labels[Train_Idx[[i]]])
    targetCellTypes = as.factor(Labels[Test_Idx[[i]]])
    start_time <- Sys.time()
    # 2. Unify sets, excluding low expressed genes (detected in <= 10 cells
    # in either set).
    source_n_cells_counts = apply(ds1, 2, function(x) { sum(x > 0) } )
    target_n_cells_counts = apply(ds2, 2, function(x) { sum(x > 0) } )
    common_genes = intersect( colnames(ds1)[source_n_cells_counts>10],
                              colnames(ds2)[target_n_cells_counts>10])
    remove(source_n_cells_counts, target_n_cells_counts)
    ds1 = ds1[, colnames(ds1) %in% common_genes]
    ds2 = ds2[, colnames(ds2) %in% common_genes]
    # Stack source on top of target so both are binned/encoded identically;
    # isSource tracks which rows belong to the training set.
    ds = rbind(ds1[,common_genes], ds2[,common_genes])
    isSource = c(rep(TRUE,nrow(ds1)), rep(FALSE,nrow(ds2)))
    remove(ds1, ds2)
    # 3. Highest mean in both source and target
    topFeaturesAvg = colnames(ds)[order(apply(ds, 2, mean), decreasing = T)]
    end_time <- Sys.time()
    Training_Time_Castle[i] <- as.numeric(difftime(end_time,start_time,units = 'secs'))
    start_time <- Sys.time()
    # for each cell - what is the most probable classification?
    # targetClassification: one row per source cell type, one column per
    # target cell, holding the one-vs-rest probability.
    L = length(levels(sourceCellTypes))
    targetClassification = as.data.frame(matrix(rep(0,L*sum(!isSource)), nrow=L), row.names = levels(sourceCellTypes))
    for (cellType in levels(sourceCellTypes)) {
      # Binary one-vs-rest labels for this cell type.
      inSourceCellType = as.factor(ifelse(sourceCellTypes == cellType, cellType, paste0("NOT",cellType)))
      # 4. Highest mutual information in source (igraph::compare with
      # normalized mutual information on the binned expression).
      topFeaturesMi = names(sort(apply(ds[isSource,],2,function(x) { compare(cut(x,breaks=BREAKS),inSourceCellType,method = "nmi") }), decreasing = T))
      # 5. Top n genes that appear in both mi and avg
      selectedFeatures = union(head(topFeaturesAvg, nFeatures) , head(topFeaturesMi, nFeatures) )
      # 6. remove correlated features (Pearson r >= 0.9)
      tmp = cor(ds[,selectedFeatures], method = "pearson")
      tmp[!lower.tri(tmp)] = 0
      selectedFeatures = selectedFeatures[apply(tmp,2,function(x) any(x < 0.9))]
      remove(tmp)
      # 7,8. Convert data from continous to binned dummy vars
      # break datasets to bins
      dsBins = apply(ds[, selectedFeatures], 2, cut, breaks= BREAKS)
      # use only bins with more than one value
      nUniq = apply(dsBins, 2, function(x) { length(unique(x)) })
      # convert to dummy vars
      ds0 = model.matrix(~ . , as.data.frame(dsBins[,nUniq>1]))
      remove(dsBins, nUniq)
      cat(paste0("
Classifier for ",cellType,"
"))
      inTypeSource = sourceCellTypes == cellType
      # 9. Classify: gradient-boosted one-vs-rest classifier on the source set.
      xg=xgboost(data=ds0[isSource,] ,
                 label=inTypeSource,
                 objective="binary:logistic",
                 eta=0.7 , nthread=1, nround=20, verbose=0,
                 gamma=0.001, max_depth=5, min_child_weight=10)
      # 10. Predict membership probability for each target cell.
      inTypeProb = predict(xg, ds0[!isSource, ])
      targetClassification[cellType,] = inTypeProb
    }
    end_time <- Sys.time()
    Testing_Time_Castle[i] <- as.numeric(difftime(end_time,start_time,units = 'secs'))
    True_Labels_Castle[i] <- list(Labels[Test_Idx[[i]]])
    # Predicted label = cell type with the highest probability per target cell.
    Pred_Labels_Castle[i] <- list(rownames(targetClassification)[apply(targetClassification,2,which.max)])
  }
  # Flatten the per-fold lists into single vectors for export.
  True_Labels_Castle <- as.vector(unlist(True_Labels_Castle))
  Pred_Labels_Castle <- as.vector(unlist(Pred_Labels_Castle))
  Training_Time_Castle <- as.vector(unlist(Training_Time_Castle))
  Testing_Time_Castle <- as.vector(unlist(Testing_Time_Castle))
  # NOTE(review): unlike the other wrappers, this one never setwd()s to
  # OutputDir, so results land in the current working directory — confirm.
  if(!is.null(GeneOrderPath) & !is.null (NumGenes)){
    write.csv(True_Labels_Castle,paste('True_Labels_Castle_',NumGenes,'.csv', sep = ''),row.names = FALSE)
    write.csv(Pred_Labels_Castle,paste('Pred_Labels_Castle_',NumGenes,'.csv', sep = ''),row.names = FALSE)
    write.csv(Training_Time_Castle,paste('Training_Time_Castle_',NumGenes,'.csv', sep = ''),row.names = FALSE)
    write.csv(Testing_Time_Castle,paste('Testing_Time_Castle_',NumGenes,'.csv', sep = ''),row.names = FALSE)
  }
  else{
    write.csv(True_Labels_Castle,'True_Labels_CaSTLe.csv',row.names = FALSE)
    write.csv(Pred_Labels_Castle,'Pred_Labels_CaSTLe.csv',row.names = FALSE)
    write.csv(Training_Time_Castle,'Training_Time_CaSTLe.csv',row.names = FALSE)
    write.csv(Testing_Time_Castle,'Testing_Time_CaSTLe.csv',row.names = FALSE)
  }
}
================================================
FILE: Scripts/run_Cell_BLAST.py
================================================
import os
import time as tm
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
import tensorflow as tf
tf.logging.set_verbosity(0)
import Cell_BLAST as cb
import numpy as np
from numpy import genfromtxt as gft
import rpy2.robjects as robjects
def run_Cell_BLAST(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
    '''
    run Cell_BLAST
    Wrapper script to run Cell_BLAST on a benchmark dataset with 5-fold cross validation,
    outputs lists of true and predicted cell labels as csv files, as well as computation time.
    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
        as row names and gene names as column names.
    LabelsPath : Cell population annotations file path (.csv).
    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
    OutputDir : Output directory defining the path of the exported file.
    GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
        defining the genes order for each cross validation fold, default is NULL.
    NumGenes : Number of genes used in case of feature selection (integer), default is 0.
    '''
    # Load the fold definitions produced by Cross_Validation.R into the R
    # session, then pull the variables across via rpy2.
    robjects.r['load'](CV_RDataPath)
    nfolds = np.array(robjects.r['n_folds'], dtype = 'int')
    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
    col = np.array(robjects.r['col_Index'], dtype = 'int')
    # R column indices are 1-based; numpy's usecols is 0-based.
    col = col - 1
    test_ind = np.array(robjects.r['Test_Idx'])
    train_ind = np.array(robjects.r['Train_Idx'])
    # read the feature (per-fold gene ranking) file, only used when NumGenes > 0
    if (NumGenes > 0):
        features = pd.read_csv(GeneOrderPath, header = 0, index_col = None, sep = ',')
    # Read + normalize the expression data, then restrict to the retained cells.
    data_old = cb.data.ExprDataSet.read_table(DataPath, orientation = "cg", sep = ",",
                                              index_col = 0, header = 0, sparsify = True).normalize()
    data = cb.data.ExprDataSet(data_old.exprs[tokeep], data_old.obs.iloc[tokeep],
                               data_old.var, data_old.uns)
    # BUGFIX: the original read the labels twice (once with pandas, then
    # overwrote that result with genfromtxt); the dead pandas read was removed.
    labels = gft(LabelsPath, dtype = "str", skip_header = 1, delimiter = ",", usecols = col)
    labels = labels[tokeep]
    # All intermediate and result files are written into OutputDir.
    os.chdir(OutputDir)
    truelab = []
    pred = []
    tr_time = []
    ts_time = []
    for i in range(np.squeeze(nfolds)):
        # Fold indices are 1-based (from R); convert to 0-based.
        test_ind_i = np.array(test_ind[i], dtype = 'int') - 1
        train_ind_i = np.array(train_ind[i], dtype = 'int') - 1
        train = data[train_ind_i, :]
        test = data[test_ind_i, :]
        y_train = labels[train_ind_i]
        y_test = labels[test_ind_i]
        if (NumGenes > 0):
            # Keep only the top-NumGenes features ranked for this fold.
            feat_to_use = features.iloc[0:NumGenes, i]
            train = train[:, feat_to_use]
            test = test[:, feat_to_use]
        train.obs['cell_type'] = y_train
        start = tm.time()
        # Train an ensemble of 4 DIRECTi models (one per random seed) to
        # reduce dimensions, then build the BLAST index over the ensemble.
        num_epoch = 50
        models = []
        for j in range(4):
            models.append(cb.directi.fit_DIRECTi(train, epoch = num_epoch, patience = 10,
                                                 random_seed = j, path = "%d" % j))
        blast = cb.blast.BLAST(models, train).build_empirical()
        tr_time.append(tm.time() - start)
        # Predict labels by querying the index with the test cells.
        start = tm.time()
        test_pred = blast.query(test).annotate('cell_type')
        ts_time.append(tm.time() - start)
        truelab.extend(y_test)
        pred.extend(test_pred.values)
    # write results; file names carry the gene count when feature selection was used
    truelab = pd.DataFrame(truelab)
    pred = pd.DataFrame(pred)
    tr_time = pd.DataFrame(tr_time)
    ts_time = pd.DataFrame(ts_time)
    if (NumGenes == 0):
        truelab.to_csv("Cell_BLAST_True_Labels.csv", index = False)
        pred.to_csv("Cell_BLAST_Pred_Labels.csv", index = False)
        tr_time.to_csv("Cell_BLAST_Training_Time.csv", index = False)
        ts_time.to_csv("Cell_BLAST_Testing_Time.csv", index = False)
    else:
        truelab.to_csv("Cell_BLAST_" + str(NumGenes) + "_True_Labels.csv", index = False)
        pred.to_csv("Cell_BLAST_" + str(NumGenes) + "_Pred_Labels.csv", index = False)
        tr_time.to_csv("Cell_BLAST_" + str(NumGenes) + "_Training_Time.csv", index = False)
        ts_time.to_csv("Cell_BLAST_" + str(NumGenes) + "_Testing_Time.csv", index = False)
FILE: Scripts/run_DigitalCellSorter.py
================================================
import numpy as np
import pandas as pd
import scripts.DigitalCellSorter as DigitalCellSorter
import os
import time as tm
import rpy2.robjects as robjects
def run_DigitalCellSorter(DataPath, LabelsPath, CV_RDataPath, GeneListPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
    '''
    run DigitalCellSorter
    Wrapper script to run DigitalCellSorter on a benchmark dataset using a predefined genelist,
    outputs lists of true and predicted cell labels as csv files, as well as computation time.
    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
    as row names and gene names as column names.
    LabelsPath : Cell population annotations file path (.csv).
    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
    GeneListPath : Data file path to the genelist.
    OutputDir : Output directory defining the path of the exported file.
    GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
    defining the genes order for each cross validation fold, default is NULL.
    NumGenes : Number of genes used in case of feature selection (integer), default is 0.
    '''
    # read the RData file: provides Cells_to_Keep (filter mask) and col_Index
    robjects.r['load'](CV_RDataPath)
    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
    col = np.array(robjects.r['col_Index'], dtype = 'int')
    col = col - 1  # R indices are 1-based
    # read the data (cells x genes) and matching annotations
    data = pd.read_csv(DataPath, index_col=0, sep=',')
    data = data.iloc[tokeep]
    truelab = pd.read_csv(LabelsPath, header=0, index_col=None, sep=',', usecols = col)
    truelab = truelab.iloc[tokeep]
    # optional feature selection: keep only the top NumGenes ranked genes
    if (NumGenes > 0):
        features = pd.read_csv(GeneOrderPath, header=0, index_col=None, sep=',')
        feat_to_use = features.iloc[0:NumGenes, 0]
        data = data.iloc[:, feat_to_use]
    data = data.transpose()  # DigitalCellSorter expects genes x cells
    # number of different cell types in the data? (hard-coded for this benchmark)
    n_clusters = 8
    AvailableCPUsCount = 1
    N_samples_for_distribution = 10000
    start = tm.time()
    pred = DigitalCellSorter.DigitalCellSorter().Process(data, 'DigitalCellSorter_Zhang',
                                                         saveDir = OutputDir,
                                                         geneListFileName = GeneListPath,
                                                         N_samples_for_distribution = N_samples_for_distribution,
                                                         AvailableCPUsCount = AvailableCPUsCount,
                                                         clusterIndex=None,
                                                         clusterName=None,
                                                         n_clusters=n_clusters)
    runtime = tm.time() - start
    os.chdir(OutputDir)
    # column 11 of the voting sheet holds the cell-type name voted per cluster
    results = pd.read_excel('DigitalCellSorter_Zhang_voting.xlsx', header=0, index_col=None, usecols=[11])
    # Map numeric cluster ids back to cell-type names.
    # BUG FIX: was np.zeros(..., dtype='>U10'), a fixed-width dtype that
    # silently truncated any cell-type name longer than 10 characters;
    # object dtype keeps names intact (empty string as the default).
    prediction = np.full(np.shape(pred), '', dtype=object)
    for i in range(len(results)):
        prediction[np.where(pred == i)] = results.values[i]
    prediction = pd.DataFrame(prediction)
    if (NumGenes == 0):
        truelab.to_csv("DigitalCellSorter_True_Labels.csv", index = False)
        prediction.to_csv("DigitalCellSorter_Pred_Labels.csv", index = False)
        with open("DigitalCellSorter_Total_Time.csv", 'w') as f:
            f.write("%f\n" % runtime)
    else:
        truelab.to_csv("DigitalCellSorter_" + str(NumGenes) + "_True_Labels.csv", index = False)
        prediction.to_csv("DigitalCellSorter_" + str(NumGenes) + "_Pred_Labels.csv", index = False)
        with open("DigitalCellSorter_" + str(NumGenes) + "_Total_Time.csv", 'w') as f:
            f.write("%f\n" % runtime)
================================================
FILE: Scripts/run_Garnett_CV.R
================================================
run_Garnett_CV <- function(DataPath, LabelsPath, CV_RDataPath, GenesPath, MarkerPath, OutputDir, Human){
  "
  run Garnett
  Wrapper script to run Garnett on a benchmark dataset with 5-fold cross validation,
  outputs lists of true and predicted cell labels as csv files, as well as computation time.
  Parameters
  ----------
  DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
  as row names and gene names as column names.
  LabelsPath : Cell population annotations file path (.csv).
  CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
  GenesPath : Path to the file with the genenames
  MarkerPath : Path to the file with marker genes
  OutputDir : Output directory defining the path of the exported file.
  Human : boolean indicating whether the dataset is human (TRUE) or mouse (FALSE)
  "
  # load needed libraries; the organism annotation db depends on the species
  library(garnett)
  if (Human) {
    library(org.Hs.eg.db)
  } else {
    library(org.Mm.eg.db)
  }
  # load the CV file: provides n_folds, col_Index, Cells_to_Keep, Train_Idx, Test_Idx
  load(CV_RDataPath)
  # read the labels for the retained cells
  labels <- as.matrix(read.csv(LabelsPath))
  labels <- as.vector(labels[,col_Index])
  labels <- labels[Cells_to_Keep]
  # read the data; drop the header row and the barcode column
  mat <- read.table(DataPath, sep = ",")
  data <- mat[-1,-1]
  data <- data[Cells_to_Keep,]
  data <- t(data) #ensure that the genes are rows, and the cells are columns
  cells <- mat[-1,1]
  cells <- cells[Cells_to_Keep]
  # read the genefile and build the shared featureData
  fdata <- read.table(GenesPath)
  names(fdata) <- 'gene_short_name'
  row.names(fdata) <- fdata$gene_short_name
  fd <- new("AnnotatedDataFrame", data = fdata)
  true_labels <- list()
  pred_labels <- list()
  train_time <- list()
  test_time <- list()
  for (i in c(1:n_folds)){
    lab_train = labels[Train_Idx[[i]]]
    lab_test = labels[Test_Idx[[i]]]
    train = data[,Train_Idx[[i]]]
    test = data[,Test_Idx[[i]]]
    cells_train = cells[Train_Idx[[i]]]
    cells_test = cells[Test_Idx[[i]]]
    pdata_train = data.frame(cells_train)
    pdata_test = data.frame(cells_test)
    row.names(train) <- row.names(fdata)
    row.names(test) <- row.names(fdata)
    colnames(train) <- row.names(pdata_train)
    colnames(test) <- row.names(pdata_test)
    pd_train <- new("AnnotatedDataFrame", data = pdata_train)
    pd_test <- new("AnnotatedDataFrame", data = pdata_test)
    pbmc_cds_train <- newCellDataSet(as(train, "dgCMatrix"), phenoData = pd_train, featureData = fd)
    pbmc_cds_test <- newCellDataSet(as(test, "dgCMatrix"), phenoData = pd_test, featureData = fd)
    pbmc_cds_train <- estimateSizeFactors(pbmc_cds_train)
    pbmc_cds_test <- estimateSizeFactors(pbmc_cds_test)
    # training
    start_train <- Sys.time()
    if (Human){
      pbmc_classifier <- train_cell_classifier(cds = pbmc_cds_train,
                                               marker_file = MarkerPath,
                                               db=org.Hs.eg.db,
                                               cds_gene_id_type = "SYMBOL",
                                               num_unknown = 50,
                                               marker_file_gene_id_type = "SYMBOL")
    } else {
      pbmc_classifier <- train_cell_classifier(cds = pbmc_cds_train,
                                               marker_file = MarkerPath,
                                               db=org.Mm.eg.db,
                                               cds_gene_id_type = "SYMBOL",
                                               num_unknown = 50,
                                               marker_file_gene_id_type = "SYMBOL")
    }
    end_train <- Sys.time()
    train_time[i] <- as.numeric(end_train - start_train)
    # testing
    start_test <- Sys.time()
    if (Human) {
      pbmc_cds_test <- classify_cells(pbmc_cds_test,
                                      pbmc_classifier,
                                      db = org.Hs.eg.db,
                                      cluster_extend = TRUE,
                                      cds_gene_id_type = "SYMBOL")
    } else {
      pbmc_cds_test <- classify_cells(pbmc_cds_test,
                                      pbmc_classifier,
                                      db = org.Mm.eg.db,
                                      cluster_extend = TRUE,
                                      cds_gene_id_type = "SYMBOL")
    }
    end_test <- Sys.time()
    test_time[i] <- as.numeric(end_test - start_test)
    true_labels[i] <- list(lab_test)
    pred_labels[i] <- list(pData(pbmc_cds_test)$cluster_ext_type)
  }
  true_labels <- as.vector(unlist(true_labels))
  pred_labels <- as.vector(unlist(pred_labels))
  train_time <- as.vector(unlist(train_time))
  test_time <- as.vector(unlist(test_time))
  setwd(OutputDir)
  # BUG FIX: the two time vectors were previously written to swapped filenames
  # (train_time -> *_Testing_Time.csv and test_time -> *_Training_Time.csv)
  write.csv(train_time,'Garnett_CV_Training_Time.csv',row.names = FALSE)
  write.csv(test_time,'Garnett_CV_Testing_Time.csv',row.names = FALSE)
  write.csv(true_labels, 'Garnett_CV_True_Labels.csv', row.names = FALSE)
  write.csv(pred_labels, 'Garnett_CV_Pred_Labels.csv', row.names = FALSE)
}
================================================
FILE: Scripts/run_Garnett_Pretrained.R
================================================
run_Garnett_Pretrained <- function(DataPath, LabelsPath, GenesPath, CV_RDataPath, ClassifierPath, OutputDir, Human){
  "
  run Garnett
  Wrapper script to run Garnett on a benchmark dataset with a pretrained classifier,
  outputs lists of true and predicted cell labels as csv files, as well as computation time.
  Parameters
  ----------
  DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
  as row names and gene names as column names.
  LabelsPath : Cell population annotations file path (.csv).
  CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
  GenesPath : Path to the file with the genenames
  ClassifierPath : Path to the pretrained classifier
  OutputDir : Output directory defining the path of the exported file.
  Human : boolean indicating whether the dataset is human (TRUE) or mouse (FALSE)
  "
  # species-specific annotation database is needed by classify_cells
  library(garnett)
  if (Human) {
    library(org.Hs.eg.db)
  } else {
    library(org.Mm.eg.db)
  }
  # CV RData supplies Cells_to_Keep; the classifier RData supplies the
  # pretrained model objects (hsPBMC for human, mmLung for mouse)
  load(CV_RDataPath)
  load(ClassifierPath)
  # true annotations for the retained cells
  labels <- as.matrix(read.csv(LabelsPath))
  labels <- labels[Cells_to_Keep]
  # expression matrix: strip header row and barcode column, keep the filtered
  # cells, then transpose so genes are rows and cells are columns
  raw <- read.table(DataPath, sep = ",")
  expr <- raw[-1,-1]
  expr <- expr[Cells_to_Keep,]
  expr <- t(expr)
  barcodes <- raw[-1,1]
  cell_df <- data.frame(barcodes)
  # gene annotation table keyed by gene symbol
  gene_df <- read.table(GenesPath)
  names(gene_df) <- 'gene_short_name'
  row.names(gene_df) <- gene_df$gene_short_name
  row.names(expr) <- row.names(gene_df)
  colnames(expr) <- row.names(cell_df)
  cds <- newCellDataSet(as(expr, "dgCMatrix"),
                        phenoData = new("AnnotatedDataFrame", data = cell_df),
                        featureData = new("AnnotatedDataFrame", data = gene_df))
  # classification (timed)
  t0 <- Sys.time()
  cds <- estimateSizeFactors(cds)
  if (Human){
    cds <- classify_cells(cds, hsPBMC, db = org.Hs.eg.db, cluster_extend = TRUE, cds_gene_id_type = "SYMBOL")
  } else {
    cds <- classify_cells(cds, mmLung, db = org.Mm.eg.db, cluster_extend = TRUE, cds_gene_id_type = "SYMBOL")
  }
  elapsed <- as.numeric(Sys.time() - t0)
  # export predictions, true labels and timing
  setwd(OutputDir)
  write.table(pData(cds)$cluster_ext_type, file = "Garnett_Pred_Labels.csv", append = FALSE, quote = TRUE, sep = "\t",
              eol = "\n", na = "NA", dec = ".", row.names = FALSE,
              qmethod = c("escape", "double"),
              fileEncoding = "")
  write.csv(labels,"Garnett_Pretrained_True_Labels.csv", row.names = FALSE)
  write.csv(elapsed,'Garnett_Pretrained_Testing_Time.csv',row.names = FALSE)
}
================================================
FILE: Scripts/run_LAmbDA.py
================================================
# -*- coding: utf-8 -*-
"""
Created on Thu May 23 13:51:15 2019
@author: Lieke
"""
import os
import numpy as np
import pandas as pd
import time as tm
import rpy2.robjects as robjects
import tensorflow as tf
import math
import scipy.io as sio
import optunity as opt
from tensorflow.contrib.tensor_forest.python import tensor_forest
from tensorflow.python.ops import resources
def run_LAmbDA(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
    '''
    run LAmbDA classifier
    Wrapper script to run LAmbDA on a benchmark dataset with 5-fold cross validation,
    outputs lists of true and predicted cell labels as csv files, as well as computation time.
    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
    as row names and gene names as column names.
    LabelsPath : Cell population annotations file path (.csv).
    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
    OutputDir : Output directory defining the path of the exported file.
    GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
    defining the genes order for each cross validation fold, default is NULL.
    NumGenes : Number of genes used in case of feature selection (integer), default is 0.
    '''
    # run_LAmbDA2 (the optunity objective) reads its data via these globals
    global X, Y, Gnp, Dnp, train, test, prt, cv
    # read the RData file: fold definitions, cell filter and label column
    robjects.r['load'](CV_RDataPath)
    nfolds = np.array(robjects.r['n_folds'], dtype = 'int')
    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
    col = np.array(robjects.r['col_Index'], dtype = 'int')
    col = col - 1  # R indices are 1-based
    test_ind = np.array(robjects.r['Test_Idx'])
    train_ind = np.array(robjects.r['Train_Idx'])
    # read the data (cells x genes) and annotations
    data = pd.read_csv(DataPath, index_col=0, sep=',')
    labels = pd.read_csv(LabelsPath, header=0, index_col=None, sep=',', usecols = col)
    labels = labels.iloc[tokeep]
    data = data.iloc[tokeep]
    # read the feature file (gene ranking per fold)
    if (NumGenes > 0):
        features = pd.read_csv(GeneOrderPath, header=0, index_col=None, sep=',')
    # folder with results
    os.chdir(OutputDir)
    tr_time = []
    ts_time = []
    truelab = np.zeros([len(labels), 1], dtype = int)
    predlab = np.zeros([len(labels), 1], dtype = int)
    for i in range(np.squeeze(nfolds)):
        test_ind_i = np.array(test_ind[i], dtype = 'int') - 1
        train_ind_i = np.array(train_ind[i], dtype = 'int') - 1
        X = np.array(data)
        if (NumGenes > 0):
            X = np.log2(X/10+1)
            feat_to_use = features.iloc[0:NumGenes, i]
            X = X[:, feat_to_use]
        else:
            # no precomputed ranking: filter genes by zero-rate and variance
            X = np.log2(np.transpose(select_feats(np.transpose(X), 0.5, 80))/10+1)
        # one-hot label matrix Y, plus LAmbDA's G (label map) and D (dataset map)
        uniq = np.unique(labels)
        Y = np.zeros([len(labels), len(uniq)], int)
        for j in range(len(uniq)):
            Y[np.where(labels == uniq[j])[0], j] = 1
        Y = np.array(Y)
        Gnp = np.zeros([len(uniq), len(uniq)], int)
        np.fill_diagonal(Gnp, 1)
        Gnp = np.array(Gnp)
        Dnp = np.ones([len(uniq), 1], int)
        Dnp = np.array(Dnp)
        # hold out 25% of the training fold for hyperparameter evaluation
        train_samp = int(np.floor(0.75*len(train_ind_i)))
        test_samp = len(train_ind_i) - train_samp
        perm = np.random.permutation(len(train_ind_i))
        train = perm[0:train_samp]
        # BUG FIX: was perm[train_samp:test_samp+1], which is empty (or wrong)
        # whenever test_samp + 1 <= train_samp; take the remaining 25% instead
        test = perm[train_samp:train_samp+test_samp]
        # re-draw the split until every label has at least 5 training samples
        while(np.sum(np.sum(Y[train, :], 0) < 5) > 0):
            perm = np.random.permutation(X.shape[0])
            train = perm[0:train_samp+1]
            test = perm[train_samp+1:train_samp+test_samp+1]
        cv = i
        prt = False
        # hyperparameter search (sobol sampling, 50 evaluations)
        start = tm.time()
        opt_params, _, _ = opt.minimize(run_LAmbDA2, solver_name='sobol', gamma=[0.8,1.2], delta=[0.05,0.95], tau=[10.0,11.0], prc_cut=[20,50], bs_prc=[0.2,0.6], num_trees=[10,200], max_nodes=[100,1000], num_evals=50)
        tr_time.append(tm.time()-start)
        print("Finished training!")
        # final run on the real train/test split; run_LAmbDA2 saves the
        # predictions to preds_cv<i>.mat / truth_cv<i>.mat when prt is True
        prt = True
        train = train_ind_i
        test = test_ind_i
        start = tm.time()
        run_LAmbDA2(opt_params['gamma'], opt_params['delta'], opt_params['tau'], opt_params['prc_cut'], opt_params['bs_prc'], opt_params['num_trees'], opt_params['max_nodes'])
        ts_time.append(tm.time()-start)
        tf.reset_default_graph()
        # collect the saved per-fold predictions as class indices
        predfile = 'preds_cv' + str(cv) + '.mat'
        truefile = 'truth_cv' + str(cv) + '.mat'
        pred = sio.loadmat(predfile)
        truth = sio.loadmat(truefile)
        pred = pred['preds']
        truth = truth['labels']
        pred_ind = np.argmax(pred, axis=1)
        truth_ind = np.argmax(truth, axis=1)
        predlab[test_ind_i, 0] = pred_ind
        truelab[test_ind_i, 0] = truth_ind
    # write results
    truelab = pd.DataFrame(truelab)
    predlab = pd.DataFrame(predlab)
    tr_time = pd.DataFrame(tr_time)
    ts_time = pd.DataFrame(ts_time)
    if (NumGenes == 0):
        truelab.to_csv("LAmbDA_True_Labels.csv", index = False)
        predlab.to_csv("LAmbDA_Pred_Labels.csv", index = False)
        tr_time.to_csv("LAmbDA_Training_Time.csv", index = False)
        ts_time.to_csv("LAmbDA_Testing_Time.csv", index = False)
    else:
        truelab.to_csv("LAmbDA_" + str(NumGenes) + "_True_Labels.csv", index = False)
        predlab.to_csv("LAmbDA_" + str(NumGenes) + "_Pred_Labels.csv", index = False)
        tr_time.to_csv("LAmbDA_" + str(NumGenes) + "_Training_Time.csv", index = False)
        ts_time.to_csv("LAmbDA_" + str(NumGenes) + "_Testing_Time.csv", index = False)
##### Functions copied from LAmbDA's Github
def wt_cutoff(colnum, cutoff, Gtmp, gamma):
    """Weighted resampling cutoff for label column `colnum`.

    Scales `cutoff` by log2(max_rowsum / rowsum[colnum] + 1) ** gamma, so
    labels whose row in Gtmp maps to fewer outputs get a larger target count.
    Returns an integer (ceiling).
    """
    row_totals = np.sum(Gtmp, axis=1)
    ratio = max(row_totals) / row_totals[colnum]
    weight = math.log(ratio + 1, 2) ** gamma
    return math.ceil(cutoff * weight)
def resample(prc_cut, Y, Gtmp, train, gamma):
    """Class-balance the training indices.

    For every label column: labels rarer than their weighted cutoff (see
    wt_cutoff) are oversampled with replacement, labels more common than the
    cutoff are subsampled without replacement, and labels exactly at the
    cutoff (or absent) are left untouched.  Returns the balanced index array.
    """
    oversampled = list()
    dropped = list()
    label_counts = np.sum(Y[train, :], axis=0)
    base_cut = math.ceil(np.percentile(label_counts, prc_cut))
    for col in range(len(label_counts)):
        count = label_counts[col]
        if count == 0:
            continue
        target = wt_cutoff(col, base_cut, Gtmp, gamma)
        if count < target:
            # too rare: draw extra members of this label (with replacement)
            members = np.squeeze(np.array(np.where(Y[train, col] >= 1)))
            extra = np.random.choice(train[members], int(target - count))
            oversampled = oversampled + extra.tolist()
        elif count > target:
            # too common: mark the surplus for removal (without replacement)
            members = np.squeeze(np.array(np.where(Y[train, col] >= 1)))
            surplus = np.random.choice(train[members], int(count - target), replace=False)
            dropped = dropped + surplus.tolist()
    kept = [val for val in train if val not in dropped]
    return np.concatenate((list(kept), oversampled))
def select_feats(Xtmp, num_zero_prc_cut, var_prc_cut):
    """Feature selection for LAmbDA: drop mostly-zero features, then keep
    only high-variance features.

    Parameters
    ----------
    Xtmp : 2D array, features x samples.
    num_zero_prc_cut : fraction in [0, 1]; features with at least this
        fraction of zero entries are removed (caller uses 0.5).
    var_prc_cut : percentile in [0, 100]; only features with variance above
        this percentile of the remaining variances are kept (caller uses 80).

    Returns the filtered features x samples matrix.

    BUG FIX: the original line fused two statements with the comparison
    operators lost (likely `<`/`>` stripped by a text/HTML export), leaving
    the undefined name `num_feat_zerosnp`; reconstructed from the variable
    names and the call site select_feats(np.transpose(X), 0.5, 80).
    '''
    """
    #*********************************************************************
    # remove features with many zeros
    num_feat_zeros = np.sum(Xtmp == 0, axis=1)
    Xtmp = Xtmp[num_feat_zeros < num_zero_prc_cut * Xtmp.shape[1], :]
    # keep only features whose variance exceeds the var_prc_cut-th percentile
    feat_vars = np.var(Xtmp, axis=1)
    Xtmp = Xtmp[feat_vars > np.percentile(feat_vars, var_prc_cut), :]
    return(Xtmp)
def get_yn(predict,ys,delta,tau,output_feats):
    """Re-assign one-hot training targets from current predictions (LAmbDA).

    Reads the module globals Dnp and Gnp set by run_LAmbDA (in this file Dnp
    is an all-ones [n_labels, 1] column and Gnp is the identity label map).
    `predict` are current model probabilities, `ys` the one-hot labels,
    `delta` blends the two re-weighted terms and `tau` sharpens the
    per-class correction.  Returns a float32 one-hot tensor of shape
    [n_samples, output_feats].
    NOTE(review): the exact semantics of each term come from the LAmbDA
    paper/upstream code -- confirm there before relying on the comments below.
    """
    D = tf.cast(Dnp, tf.float32);
    G = tf.cast(Gnp, tf.float32);
    ys = tf.cast(ys, tf.float32);
    # Cm: mean (smoothed, +0.1) prediction per dataset group defined by ys*D
    Cm = tf.matmul(tf.transpose(tf.matmul(ys,D)),predict+0.1)/tf.reshape(tf.reduce_sum(tf.transpose(tf.matmul(ys,D)),1),(-1,1));
    # mCm: mean of Cm over the outputs reachable through D'G
    mCm = tf.reshape(tf.reduce_mean(tf.cast(tf.matmul(tf.transpose(D),G)>0,tf.float32)*Cm,1),(-1,1));
    # yw: predictions re-weighted by (mCm/Cm)^tau for each sample's group
    yw = tf.multiply(predict+0.1,tf.matmul(tf.matmul(ys,D),tf.pow(mCm/Cm,tau)));
    # ye: restrict the weights to outputs allowed by the label map G
    ye = tf.multiply(tf.matmul(ys,G),yw);
    # yt: pool ye over samples sharing the same label
    yt = tf.matmul(ys,tf.matmul(tf.transpose(ys),ye));
    # ya: delta blends the pooled (yt) and per-sample (ye) scores
    ya = (delta*yt)+((1-delta)*ye)
    # final target: one-hot of the argmax output per sample
    yn = tf.cast(tf.one_hot(tf.argmax(ya,axis=1),output_feats), dtype=tf.float32)
    return(yn)
def get_yi(rowsums, G2, ys):
    """Map one-hot labels `ys` through the reduced label map `G2`.

    Returns float32 tf tensor ys @ G2.  `rowsums` is accepted for signature
    compatibility with the call sites but is not used.
    """
    mapping_f32 = tf.cast(G2, tf.float32)
    labels_f32 = tf.cast(ys, tf.float32)
    return tf.cast(tf.matmul(labels_f32, mapping_f32), dtype=tf.float32)
def run_LAmbDA2(gamma, delta, tau, prc_cut, bs_prc, num_trees, max_nodes):
    """Build, train and score one LAmbDA random-forest model (TF1 contrib).

    Serves two roles: the optunity objective during hyperparameter search
    (module global prt is False) and the final fold evaluation (prt is True,
    in which case predictions/truth are saved to preds_cv<cv>.mat and
    truth_cv<cv>.mat).  All data arrives through the module globals set by
    run_LAmbDA.  Returns the squared-error score on the test tensor (lower
    is better, hence opt.minimize).
    NOTE(review): indentation of this extract was reconstructed -- confirm
    the loop structure against the upstream LAmbDA repository.
    """
    global X, Y, Gnp, Dnp, train, test, prt, cv
    D = tf.cast(Dnp, tf.float32);
    G = tf.cast(Gnp, tf.float32);
    #optunity_it = optunity_it+1;
    # optunity passes floats; the forest needs integer hyperparameters
    num_trees = int(num_trees);
    max_nodes = int(max_nodes);
    prc_cut = int(np.ceil(prc_cut));
    print("gamma=%.4f, delta=%.4f, tau=%.4f, prc_cut=%i, bs_prc=%.4f, num_trees=%i, max_nodes=%i" % (gamma, delta, tau, prc_cut, bs_prc, num_trees, max_nodes))
    input_feats = X.shape[1];
    num_labls = G.shape.as_list();
    output_feats = num_labls[1];
    num_labls = num_labls[0];
    rowsums = np.sum(Gnp,axis=1);
    # class-balance the training indices (upstream author marked this "Bug??";
    # note resampling re-runs on every objective call)
    train2 = resample(prc_cut, Y, Gnp, train, gamma);
    # batch size as a fraction of the resampled training set
    bs = int(np.ceil(bs_prc*train2.size))
    xs = tf.placeholder(tf.float32, [None,input_feats])
    #ys = tf.placeholder(tf.float32, [None,num_labls])
    yin = tf.placeholder(tf.int32, [None])
    print("Vars loaded xs and ys created")
    hparams = tensor_forest.ForestHParams(num_classes=output_feats,
                                          num_features=input_feats,
                                          num_trees=num_trees,
                                          max_nodes=max_nodes).fill()
    print("Tensor forest hparams created")
    forest_graph = tensor_forest.RandomForestGraphs(hparams)
    print("Tensor forest graph created")
    train_op = forest_graph.training_graph(xs, yin)
    loss_op = forest_graph.training_loss(xs, yin)
    print("Loss and train ops created")
    predict, _, _ = forest_graph.inference_graph(xs)
    print("Tensor forest variables created through predict")
    # score: mean squared error between one-hot targets and predicted probs
    accuracy_op = tf.reduce_mean(tf.reduce_sum(tf.square(tf.one_hot(yin,output_feats)-predict),reduction_indices=[1]))
    print(tf.reduce_sum(tf.square(tf.one_hot(yin,output_feats)-predict),reduction_indices=[1]))
    #predict = tf.one_hot(pred);
    print("Lambda specific variables created")
    # Creating training and testing steps
    # G2 keeps only the labels that map to a single output (rowsums == 1)
    G2 = np.copy(Gnp);
    G2[rowsums>1,:] = 0;
    YI = np.matmul(Y,G2);
    YIrs = np.sum(YI,axis=1);
    # samples with an unambiguous label, used for the first training phase
    trainI = train2[np.in1d(train2,np.where(YIrs==1))];
    print("data type trainI,",trainI.dtype)
    testI = test[np.in1d(test,np.where(YIrs==1))];
    print("trainI testI created")
    #init_vars=tf.global_variables_initializer()
    init_vars = tf.group(tf.global_variables_initializer(),
                         resources.initialize_resources(resources.shared_resources()))
    sess = tf.Session()
    sess.run(init_vars)
    print("Session started")
    #beep = sess.run(predict,feed_dict={xs:X[1:100,:]});
    #beep = sess.run(predict,feed_dict={xs:X[train2[0:bs],:]});
    # phase-1 feeds: unambiguous-label samples, integer targets via get_yi
    tensor_trainI = {xs: X[trainI, :], yin: sess.run(tf.argmax(get_yi(rowsums,G2,Y[trainI, :]),axis=1))}
    print("tensor_trainI made")
    tensor_testI = {xs: X[testI, :], yin: sess.run(tf.argmax(get_yi(rowsums,G2,Y[testI, :]),axis=1))}
    print("tensor_testI made")
    # phase-2 feeds: resampled training batch, targets re-assigned by get_yn
    tensor_train = {xs: X[train2[0:bs], :], yin: sess.run(tf.argmax(get_yn(sess.run(predict,feed_dict={xs:X[train2[0:bs],:]}),Y[train2[0:bs], :],delta,tau,output_feats),axis=1))}
    print("tensor_train made")
    tensor_test = {xs: X[test, :], yin: sess.run(tf.argmax(get_yn(sess.run(predict,feed_dict={xs:X[test,:]}),Y[test, :],delta,tau,output_feats),axis=1))}
    print("tensor_test made")
    #**********************************
    #print("Loss and training steps created with sample tensors")
    # Setting params and initializing
    print("Beginning iterations")
    # Starting training iterations
    print(X.shape)
    for i in range(1,101):
        if i < 50:
            # phase 1: fit on unambiguous labels only
            sess.run(train_op, feed_dict=tensor_trainI)
            #print("ran train op")
            if i % 10 == 0:
                print(str(sess.run(accuracy_op, feed_dict=tensor_trainI)) + ' ' + str(sess.run(accuracy_op, feed_dict=tensor_testI)))
        else:
            # phase 2: fit on the re-labelled resampled training set
            sess.run(train_op, feed_dict=tensor_train)
            if i % 10 == 0:
                print(str(sess.run(accuracy_op, feed_dict=tensor_train)) + ' ' + str(sess.run(accuracy_op, feed_dict=tensor_test)))
            elif i % 10 == 0:
                # NOTE(review): this branch is unreachable (its condition
                # repeats the `if` above) and np.random_shuffle does not
                # exist (np.random.shuffle would); likely an upstream bug --
                # confirm against the LAmbDA repository before "fixing".
                np.random_shuffle(train2);
                tensor_train = {xs: X[train2[0:bs], :], yin: sess.run(get_yn(sess.run(predict,feed_dict={xs:X[train2[0:bs],:]}),Y[train2[0:bs], :],delta,tau,output_feats))}
    if prt:
        # final evaluation: persist predictions and ground truth for this fold
        blah = sess.run(predict, feed_dict=tensor_test);
        sio.savemat('preds_cv' + str(cv) + '.mat', {'preds': blah});
        sio.savemat('truth_cv' + str(cv) + '.mat', {'labels': Y[test, :]});
    acc = sess.run(accuracy_op, feed_dict=tensor_test)
    print("loss1=%.4f, gamma=%.4f, delta=%.4f, tau=%.4f, prc_cut=%i, bs_prc=%.4f, num_trees=%i, max_nodes=%i" % (acc, gamma, delta, tau, prc_cut, bs_prc, num_trees, max_nodes))
    tf.reset_default_graph();
    return(acc)
================================================
FILE: Scripts/run_LDA.py
================================================
import os
import numpy as np
import pandas as pd
import time as tm
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import rpy2.robjects as robjects
def run_LDA(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
    '''
    Run the baseline LDA classifier with 5-fold cross validation.

    Fold definitions come from the RData file produced by Cross_Validation.R.
    True labels, predicted labels and per-fold training/testing times are
    written as csv files into OutputDir.

    Parameters
    ----------
    DataPath : path to the csv expression matrix (cells x genes, cell
        barcodes as row names, gene names as column names).
    LabelsPath : path to the csv cell population annotations.
    CV_RDataPath : path to the RData file from Cross_Validation.R.
    OutputDir : directory the result files are written to.
    GeneOrderPath : optional csv with the gene ranking per fold (feature
        selection), default "".
    NumGenes : number of top-ranked genes to use when feature selection is
        requested, default 0 (use all genes).
    '''
    # cross-validation folds and filtering info produced by Cross_Validation.R
    robjects.r['load'](CV_RDataPath)
    nfolds = np.array(robjects.r['n_folds'], dtype='int')
    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype='bool')
    col = np.array(robjects.r['col_Index'], dtype='int') - 1  # R is 1-based
    test_ind = np.array(robjects.r['Test_Idx'])
    train_ind = np.array(robjects.r['Train_Idx'])
    # expression matrix and matching annotations, restricted to kept cells
    data = pd.read_csv(DataPath, index_col=0, sep=',')
    labels = pd.read_csv(LabelsPath, header=0, index_col=None, sep=',', usecols=col)
    labels = labels.iloc[tokeep]
    data = data.iloc[tokeep]
    # gene ranking for feature selection, if requested
    if NumGenes > 0:
        features = pd.read_csv(GeneOrderPath, header=0, index_col=None, sep=',')
    os.chdir(OutputDir)
    # log-normalise the counts
    data = np.log1p(data)
    clf = LinearDiscriminantAnalysis()
    train_times, test_times = [], []
    all_true, all_pred = [], []
    for fold in range(np.squeeze(nfolds)):
        idx_test = np.array(test_ind[fold], dtype='int') - 1
        idx_train = np.array(train_ind[fold], dtype='int') - 1
        X_train, X_test = data.iloc[idx_train], data.iloc[idx_test]
        y_train, y_test = labels.iloc[idx_train], labels.iloc[idx_test]
        if NumGenes > 0:
            selected = features.iloc[0:NumGenes, fold]
            X_train = X_train.iloc[:, selected]
            X_test = X_test.iloc[:, selected]
        t0 = tm.time()
        clf.fit(X_train, y_train)
        train_times.append(tm.time() - t0)
        t0 = tm.time()
        fold_pred = clf.predict(X_test)
        test_times.append(tm.time() - t0)
        all_true.extend(y_test.values)
        all_pred.extend(fold_pred)
    # export results; filenames carry the gene count when feature selection was used
    prefix = "LDA" if NumGenes == 0 else "LDA_" + str(NumGenes)
    pd.DataFrame(all_true).to_csv(prefix + "_True_Labels.csv", index=False)
    pd.DataFrame(all_pred).to_csv(prefix + "_Pred_Labels.csv", index=False)
    pd.DataFrame(train_times).to_csv(prefix + "_Training_Time.csv", index=False)
    pd.DataFrame(test_times).to_csv(prefix + "_Testing_Time.csv", index=False)
================================================
FILE: Scripts/run_LDA_rejection.py
================================================
import os
import numpy as np
import pandas as pd
import time as tm
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import rpy2.robjects as robjects
def run_LDA_rejection(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0, Threshold = 0.7):
    '''
    run baseline classifier: LDA with rejection
    Wrapper script to run LDA classifier on a benchmark dataset with 5-fold cross validation,
    outputs lists of true and predicted cell labels as csv files, as well as computation time.
    Cells whose maximum posterior probability is below Threshold are labelled 'Unknown'.
    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
    as row names and gene names as column names.
    LabelsPath : Cell population annotations file path (.csv).
    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
    OutputDir : Output directory defining the path of the exported file.
    GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
    defining the genes order for each cross validation fold, default is NULL.
    NumGenes : Number of genes used in case of feature selection (integer), default is 0.
    Threshold : Posterior-probability cutoff below which a cell is rejected, default is 0.7.
    '''
    # read the RData file: folds, cell filter and the label column index
    robjects.r['load'](CV_RDataPath)
    nfolds = np.array(robjects.r['n_folds'], dtype = 'int')
    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
    col = np.array(robjects.r['col_Index'], dtype = 'int')
    col = col - 1  # R indices are 1-based
    test_ind = np.array(robjects.r['Test_Idx'])
    train_ind = np.array(robjects.r['Train_Idx'])
    # read the data (cells x genes) and annotations
    data = pd.read_csv(DataPath, index_col=0, sep=',')
    labels = pd.read_csv(LabelsPath, header=0, index_col=None, sep=',', usecols = col)
    labels = labels.iloc[tokeep]
    data = data.iloc[tokeep]
    # read the feature file (gene ranking per fold)
    if (NumGenes > 0):
        features = pd.read_csv(GeneOrderPath, header=0, index_col=None, sep=',')
    # folder with results
    os.chdir(OutputDir)
    # normalize data
    data = np.log1p(data)
    Classifier = LinearDiscriminantAnalysis()
    tr_time = []
    ts_time = []
    truelab = []
    pred = []
    for i in range(np.squeeze(nfolds)):
        test_ind_i = np.array(test_ind[i], dtype = 'int') - 1
        train_ind_i = np.array(train_ind[i], dtype = 'int') - 1
        train = data.iloc[train_ind_i]
        test = data.iloc[test_ind_i]
        y_train = labels.iloc[train_ind_i]
        y_test = labels.iloc[test_ind_i]
        if (NumGenes > 0):
            feat_to_use = features.iloc[0:NumGenes, i]
            train = train.iloc[:, feat_to_use]
            test = test.iloc[:, feat_to_use]
        start = tm.time()
        Classifier.fit(train, y_train)
        tr_time.append(tm.time()-start)
        start = tm.time()
        # BUG FIX (robustness): predict() can return a fixed-width unicode
        # array sized to the training class names, in which case assigning
        # 'Unknown' would be silently truncated (e.g. to 'U' for 1-char
        # labels); cast to object dtype so the rejection label stays intact
        predicted = Classifier.predict(test).astype(object)
        prob = np.max(Classifier.predict_proba(test), axis = 1)
        unlabeled = np.where(prob < Threshold)
        predicted[unlabeled] = 'Unknown'
        ts_time.append(tm.time()-start)
        truelab.extend(y_test.values)
        pred.extend(predicted)
    truelab = pd.DataFrame(truelab)
    pred = pd.DataFrame(pred)
    tr_time = pd.DataFrame(tr_time)
    ts_time = pd.DataFrame(ts_time)
    # NOTE(review): these filenames are identical to the ones written by
    # run_LDA, so running both methods with the same OutputDir overwrites
    # results; kept unchanged for backward compatibility with downstream
    # evaluation -- confirm before renaming.
    if (NumGenes == 0):
        truelab.to_csv("LDA_True_Labels.csv", index = False)
        pred.to_csv("LDA_Pred_Labels.csv", index = False)
        tr_time.to_csv("LDA_Training_Time.csv", index = False)
        ts_time.to_csv("LDA_Testing_Time.csv", index = False)
    else:
        truelab.to_csv("LDA_" + str(NumGenes) + "_True_Labels.csv", index = False)
        pred.to_csv("LDA_" + str(NumGenes) + "_Pred_Labels.csv", index = False)
        tr_time.to_csv("LDA_" + str(NumGenes) + "_Training_Time.csv", index = False)
        ts_time.to_csv("LDA_" + str(NumGenes) + "_Testing_Time.csv", index = False)
================================================
FILE: Scripts/run_NMC.py
================================================
import os
import numpy as np
import pandas as pd
import time as tm
from sklearn.neighbors import NearestCentroid
import rpy2.robjects as robjects
def run_NMC(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
    '''
    Run the baseline nearest-mean (nearest-centroid) classifier with 5-fold
    cross validation.

    Fold definitions come from the RData file produced by Cross_Validation.R.
    True labels, predicted labels and per-fold training/testing times are
    written as csv files into OutputDir.

    Parameters
    ----------
    DataPath : path to the csv expression matrix (cells x genes, cell
        barcodes as row names, gene names as column names).
    LabelsPath : path to the csv cell population annotations.
    CV_RDataPath : path to the RData file from Cross_Validation.R.
    OutputDir : directory the result files are written to.
    GeneOrderPath : optional csv with the gene ranking per fold (feature
        selection), default "".
    NumGenes : number of top-ranked genes to use when feature selection is
        requested, default 0 (use all genes).
    '''
    # cross-validation folds and filtering info produced by Cross_Validation.R
    robjects.r['load'](CV_RDataPath)
    nfolds = np.array(robjects.r['n_folds'], dtype='int')
    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype='bool')
    col = np.array(robjects.r['col_Index'], dtype='int') - 1  # R is 1-based
    test_ind = np.array(robjects.r['Test_Idx'])
    train_ind = np.array(robjects.r['Train_Idx'])
    # expression matrix and matching annotations, restricted to kept cells
    data = pd.read_csv(DataPath, index_col=0, sep=',')
    labels = pd.read_csv(LabelsPath, header=0, index_col=None, sep=',', usecols=col)
    labels = labels.iloc[tokeep]
    data = data.iloc[tokeep]
    # gene ranking for feature selection, if requested
    if NumGenes > 0:
        features = pd.read_csv(GeneOrderPath, header=0, index_col=None, sep=',')
    os.chdir(OutputDir)
    # log-normalise the counts
    data = np.log1p(data)
    clf = NearestCentroid()
    train_times, test_times = [], []
    all_true, all_pred = [], []
    for fold in range(np.squeeze(nfolds)):
        idx_test = np.array(test_ind[fold], dtype='int') - 1
        idx_train = np.array(train_ind[fold], dtype='int') - 1
        X_train, X_test = data.iloc[idx_train], data.iloc[idx_test]
        y_train, y_test = labels.iloc[idx_train], labels.iloc[idx_test]
        if NumGenes > 0:
            selected = features.iloc[0:NumGenes, fold]
            X_train = X_train.iloc[:, selected]
            X_test = X_test.iloc[:, selected]
        t0 = tm.time()
        clf.fit(X_train, y_train)
        train_times.append(tm.time() - t0)
        t0 = tm.time()
        fold_pred = clf.predict(X_test)
        test_times.append(tm.time() - t0)
        all_true.extend(y_test.values)
        all_pred.extend(fold_pred)
    # export results; filenames carry the gene count when feature selection was used
    prefix = "NMC" if NumGenes == 0 else "NMC_" + str(NumGenes)
    pd.DataFrame(all_true).to_csv(prefix + "_True_Labels.csv", index=False)
    pd.DataFrame(all_pred).to_csv(prefix + "_Pred_Labels.csv", index=False)
    pd.DataFrame(train_times).to_csv(prefix + "_Training_Time.csv", index=False)
    pd.DataFrame(test_times).to_csv(prefix + "_Testing_Time.csv", index=False)
================================================
FILE: Scripts/run_RF.py
================================================
import os
import numpy as np
import pandas as pd
import time as tm
from sklearn.ensemble import RandomForestClassifier
import rpy2.robjects as robjects
def run_RF(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
    '''
    run baseline classifier: RF
    Wrapper script to run a RF classifier with 50 trees on a benchmark dataset with 5-fold cross validation,
    outputs lists of true and predicted cell labels as csv files, as well as computation time.
    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
    as row names and gene names as column names.
    LabelsPath : Cell population annotations file path (.csv).
    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
    OutputDir : Output directory defining the path of the exported file.
    GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
    defining the genes order for each cross validation fold, default is NULL.
    NumGenes : Number of genes used in case of feature selection (integer), default is 0.
    '''
    # read the RData file holding the cross-validation fold definitions
    robjects.r['load'](CV_RDataPath)

    nfolds = np.array(robjects.r['n_folds'], dtype = 'int')
    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
    col = np.array(robjects.r['col_Index'], dtype = 'int')
    col = col - 1  # R column index is 1-based; shift to 0-based for pandas
    test_ind = np.array(robjects.r['Test_Idx'])
    train_ind = np.array(robjects.r['Train_Idx'])

    # read the data and labels, keeping only cells retained by the CV filter
    data = pd.read_csv(DataPath, index_col = 0, sep = ',')
    labels = pd.read_csv(LabelsPath, header = 0, index_col = None, sep = ',', usecols = col)
    labels = labels.iloc[tokeep]
    data = data.iloc[tokeep]

    # read the feature file (one ranked column of gene indices per fold)
    if (NumGenes > 0):
        features = pd.read_csv(GeneOrderPath, header = 0, index_col = None, sep = ',')

    # folder with results
    os.chdir(OutputDir)

    # normalize data
    data = np.log1p(data)

    Classifier = RandomForestClassifier(n_estimators = 50)

    tr_time = []
    ts_time = []
    truelab = []
    pred = []

    for i in range(np.squeeze(nfolds)):
        # fold indices come from R and are 1-based; shift to 0-based
        test_ind_i = np.array(test_ind[i], dtype = 'int') - 1
        train_ind_i = np.array(train_ind[i], dtype = 'int') - 1

        train = data.iloc[train_ind_i]
        test = data.iloc[test_ind_i]
        y_train = labels.iloc[train_ind_i]
        y_test = labels.iloc[test_ind_i]

        if (NumGenes > 0):
            # restrict to the top-NumGenes features selected for this fold
            feat_to_use = features.iloc[0:NumGenes, i]
            train = train.iloc[:, feat_to_use]
            test = test.iloc[:, feat_to_use]

        start = tm.time()
        # ravel the single-column label frame to a 1-D array: sklearn expects
        # y of shape (n_samples,) and warns on a column vector
        Classifier.fit(train, np.ravel(y_train))
        tr_time.append(tm.time() - start)

        start = tm.time()
        predicted = Classifier.predict(test)
        ts_time.append(tm.time() - start)

        truelab.extend(y_test.values)
        pred.extend(predicted)

    truelab = pd.DataFrame(truelab)
    pred = pd.DataFrame(pred)
    tr_time = pd.DataFrame(tr_time)
    ts_time = pd.DataFrame(ts_time)

    # write the concatenated per-fold results
    if (NumGenes == 0):
        truelab.to_csv("RF_True_Labels.csv", index = False)
        pred.to_csv("RF_Pred_Labels.csv", index = False)
        tr_time.to_csv("RF_Training_Time.csv", index = False)
        ts_time.to_csv("RF_Testing_Time.csv", index = False)
    else:
        truelab.to_csv("RF_" + str(NumGenes) + "_True_Labels.csv", index = False)
        pred.to_csv("RF_" + str(NumGenes) + "_Pred_Labels.csv", index = False)
        tr_time.to_csv("RF_" + str(NumGenes) + "_Training_Time.csv", index = False)
        ts_time.to_csv("RF_" + str(NumGenes) + "_Testing_Time.csv", index = False)
================================================
FILE: Scripts/run_SCINA.R
================================================
run_SCINA<-function(DataPath,LabelsPath,GeneSigPath,OutputDir){
  "
  run SCINA
  Wrapper script to run SCINA on a benchmark dataset,
  outputs lists of true and predicted cell labels as csv files, as well as computation time.
  Parameters
  ----------
  DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
  as row names and gene names as column names.
  LabelsPath : Cell population annotations file path (.csv).
  GeneSigPath : Cell type marker genes file path (.csv)
  OutputDir : Output directory defining the path of the exported file.
  "
  Data <- read.csv(DataPath, row.names = 1)
  Labels <- as.vector(as.matrix(read.csv(LabelsPath)))

  # keep only the three populations for which marker signatures exist
  keep <- is.element(Labels, c('CD14+ Monocyte','CD19+ B','CD56+ NK'))
  Data <- Data[keep, ]
  Labels <- Labels[keep]

  # rename populations to match the identifiers used in the signature file
  Labels[Labels == 'CD14+ Monocyte'] <- 'CD14_Monocyte'
  Labels[Labels == 'CD19+ B'] <- 'CD19_B'
  Labels[Labels == 'CD56+ NK'] <- 'CD56_NK'

  #############################################################################
  #                                  SCINA                                    #
  #############################################################################
  library(SCINA)
  Signature_Genes <- preprocess.signatures(GeneSigPath)

  library(preprocessCore)
  # genes x cells, log-transformed, quantile-normalized; `Data[] <-` keeps dimnames
  Data <- t(as.matrix(Data))
  Data <- log(Data + 1)
  Data[] <- normalize.quantiles(Data)

  start_time <- Sys.time()
  results <- SCINA(Data, Signature_Genes)
  end_time <- Sys.time()

  True_Labels_SCINA <- Labels
  Pred_Labels_SCINA <- results$cell_labels
  Total_Time_SCINA <- as.numeric(difftime(end_time, start_time, units = 'secs'))

  setwd(OutputDir)
  write.csv(True_Labels_SCINA,'SCINA_True_Labels.csv',row.names = FALSE)
  write.csv(Pred_Labels_SCINA,'SCINA_Pred_Labels.csv',row.names = FALSE)
  write.csv(Total_Time_SCINA,'SCINA_Total_Time.csv',row.names = FALSE)
}
================================================
FILE: Scripts/run_SVM.py
================================================
import os
import numpy as np
import pandas as pd
import time as tm
from sklearn.svm import LinearSVC
import rpy2.robjects as robjects
def run_SVM(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
    '''
    run baseline classifier: SVM
    Wrapper script to run an SVM classifier with a linear kernel on a benchmark dataset with 5-fold cross validation,
    outputs lists of true and predicted cell labels as csv files, as well as computation time.
    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
    as row names and gene names as column names.
    LabelsPath : Cell population annotations file path (.csv).
    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
    OutputDir : Output directory defining the path of the exported file.
    GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
    defining the genes order for each cross validation fold, default is NULL.
    NumGenes : Number of genes used in case of feature selection (integer), default is 0.
    '''
    # read the RData file holding the cross-validation fold definitions
    robjects.r['load'](CV_RDataPath)

    nfolds = np.array(robjects.r['n_folds'], dtype = 'int')
    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
    col = np.array(robjects.r['col_Index'], dtype = 'int')
    col = col - 1  # R column index is 1-based; shift to 0-based for pandas
    test_ind = np.array(robjects.r['Test_Idx'])
    train_ind = np.array(robjects.r['Train_Idx'])

    # read the data and labels, keeping only cells retained by the CV filter
    data = pd.read_csv(DataPath, index_col = 0, sep = ',')
    labels = pd.read_csv(LabelsPath, header = 0, index_col = None, sep = ',', usecols = col)
    labels = labels.iloc[tokeep]
    data = data.iloc[tokeep]

    # read the feature file (one ranked column of gene indices per fold)
    if (NumGenes > 0):
        features = pd.read_csv(GeneOrderPath, header = 0, index_col = None, sep = ',')

    # folder with results
    os.chdir(OutputDir)

    # normalize data
    data = np.log1p(data)

    Classifier = LinearSVC()

    tr_time = []
    ts_time = []
    truelab = []
    pred = []

    for i in range(np.squeeze(nfolds)):
        # fold indices come from R and are 1-based; shift to 0-based
        test_ind_i = np.array(test_ind[i], dtype = 'int') - 1
        train_ind_i = np.array(train_ind[i], dtype = 'int') - 1

        train = data.iloc[train_ind_i]
        test = data.iloc[test_ind_i]
        y_train = labels.iloc[train_ind_i]
        y_test = labels.iloc[test_ind_i]

        if (NumGenes > 0):
            # restrict to the top-NumGenes features selected for this fold
            feat_to_use = features.iloc[0:NumGenes, i]
            train = train.iloc[:, feat_to_use]
            test = test.iloc[:, feat_to_use]

        start = tm.time()
        # ravel the single-column label frame to a 1-D array: sklearn expects
        # y of shape (n_samples,) and warns on a column vector
        Classifier.fit(train, np.ravel(y_train))
        tr_time.append(tm.time() - start)

        start = tm.time()
        predicted = Classifier.predict(test)
        ts_time.append(tm.time() - start)

        truelab.extend(y_test.values)
        pred.extend(predicted)

    truelab = pd.DataFrame(truelab)
    pred = pd.DataFrame(pred)
    tr_time = pd.DataFrame(tr_time)
    ts_time = pd.DataFrame(ts_time)

    # write the concatenated per-fold results
    if (NumGenes == 0):
        truelab.to_csv("SVM_True_Labels.csv", index = False)
        pred.to_csv("SVM_Pred_Labels.csv", index = False)
        tr_time.to_csv("SVM_Training_Time.csv", index = False)
        ts_time.to_csv("SVM_Testing_Time.csv", index = False)
    else:
        truelab.to_csv("SVM_" + str(NumGenes) + "_True_Labels.csv", index = False)
        pred.to_csv("SVM_" + str(NumGenes) + "_Pred_Labels.csv", index = False)
        tr_time.to_csv("SVM_" + str(NumGenes) + "_Training_Time.csv", index = False)
        ts_time.to_csv("SVM_" + str(NumGenes) + "_Testing_Time.csv", index = False)
================================================
FILE: Scripts/run_SVM_rejection.py
================================================
import os
import numpy as np
import pandas as pd
import time as tm
from sklearn.svm import LinearSVC
import rpy2.robjects as robjects
from sklearn.calibration import CalibratedClassifierCV
def run_SVM(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0, Threshold = 0.7):
    '''
    run baseline classifier: SVM with rejection
    Wrapper script to run an SVM classifier with a linear kernel on a benchmark dataset with 5-fold cross validation,
    rejecting cells whose calibrated prediction probability falls below Threshold,
    outputs lists of true and predicted cell labels as csv files, as well as computation time.
    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
    as row names and gene names as column names.
    LabelsPath : Cell population annotations file path (.csv).
    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
    OutputDir : Output directory defining the path of the exported file.
    GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
    defining the genes order for each cross validation fold, default is NULL.
    NumGenes : Number of genes used in case of feature selection (integer), default is 0.
    Threshold : Threshold used when rejecting the cells, default is 0.7.
    '''
    # read the RData file holding the cross-validation fold definitions
    robjects.r['load'](CV_RDataPath)

    nfolds = np.array(robjects.r['n_folds'], dtype = 'int')
    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
    col = np.array(robjects.r['col_Index'], dtype = 'int')
    col = col - 1  # R column index is 1-based; shift to 0-based for pandas
    test_ind = np.array(robjects.r['Test_Idx'])
    train_ind = np.array(robjects.r['Train_Idx'])

    # read the data and labels, keeping only cells retained by the CV filter
    data = pd.read_csv(DataPath, index_col = 0, sep = ',')
    labels = pd.read_csv(LabelsPath, header = 0, index_col = None, sep = ',', usecols = col)
    labels = labels.iloc[tokeep]
    data = data.iloc[tokeep]

    # read the feature file (one ranked column of gene indices per fold)
    if (NumGenes > 0):
        features = pd.read_csv(GeneOrderPath, header = 0, index_col = None, sep = ',')

    # folder with results
    os.chdir(OutputDir)

    # normalize data
    data = np.log1p(data)

    # LinearSVC has no predict_proba; wrap it to obtain calibrated probabilities
    Classifier = LinearSVC()
    clf = CalibratedClassifierCV(Classifier)

    tr_time = []
    ts_time = []
    truelab = []
    pred = []

    for i in range(np.squeeze(nfolds)):
        # fold indices come from R and are 1-based; shift to 0-based
        test_ind_i = np.array(test_ind[i], dtype = 'int') - 1
        train_ind_i = np.array(train_ind[i], dtype = 'int') - 1

        train = data.iloc[train_ind_i]
        test = data.iloc[test_ind_i]
        y_train = labels.iloc[train_ind_i]
        y_test = labels.iloc[test_ind_i]

        if (NumGenes > 0):
            # restrict to the top-NumGenes features selected for this fold
            feat_to_use = features.iloc[0:NumGenes, i]
            train = train.iloc[:, feat_to_use]
            test = test.iloc[:, feat_to_use]

        start = tm.time()
        # ravel the single-column label frame to a 1-D array: sklearn expects
        # y of shape (n_samples,) and warns on a column vector
        clf.fit(train, np.ravel(y_train))
        tr_time.append(tm.time() - start)

        start = tm.time()
        predicted = clf.predict(test)
        # BUGFIX: predict() may return a fixed-width string array; assigning
        # 'Unknown' into it would silently truncate the word when all class
        # labels are shorter. Convert to object dtype before assignment.
        predicted = np.asarray(predicted, dtype = object)
        prob = np.max(clf.predict_proba(test), axis = 1)
        unlabeled = np.where(prob < Threshold)
        predicted[unlabeled] = 'Unknown'
        ts_time.append(tm.time() - start)

        truelab.extend(y_test.values)
        pred.extend(predicted)

    truelab = pd.DataFrame(truelab)
    pred = pd.DataFrame(pred)
    tr_time = pd.DataFrame(tr_time)
    ts_time = pd.DataFrame(ts_time)

    # write the concatenated per-fold results
    if (NumGenes == 0):
        truelab.to_csv("SVM_True_Labels.csv", index = False)
        pred.to_csv("SVM_Pred_Labels.csv", index = False)
        tr_time.to_csv("SVM_Training_Time.csv", index = False)
        ts_time.to_csv("SVM_Testing_Time.csv", index = False)
    else:
        truelab.to_csv("SVM_" + str(NumGenes) + "_True_Labels.csv", index = False)
        pred.to_csv("SVM_" + str(NumGenes) + "_Pred_Labels.csv", index = False)
        tr_time.to_csv("SVM_" + str(NumGenes) + "_Training_Time.csv", index = False)
        ts_time.to_csv("SVM_" + str(NumGenes) + "_Testing_Time.csv", index = False)
================================================
FILE: Scripts/run_SingleR.R
================================================
run_SingleR<-function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){
  "
  run SingleR
  Wrapper script to run SingleR on a benchmark dataset with 5-fold cross validation,
  outputs lists of true and predicted cell labels as csv files, as well as computation time.
  Parameters
  ----------
  DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
  as row names and gene names as column names.
  LabelsPath : Cell population annotations file path (.csv).
  CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
  OutputDir : Output directory defining the path of the exported file.
  GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
  defining the genes order for each cross validation fold, default is NULL.
  NumGenes : Number of genes used in case of feature selection (integer), default is NULL.
  "
  Data <- read.csv(DataPath, row.names = 1)
  Labels <- as.matrix(read.csv(LabelsPath))
  # loads n_folds, Train_Idx, Test_Idx, col_Index and Cells_to_Keep
  load(CV_RDataPath)
  Labels <- as.vector(Labels[, col_Index])
  Data <- Data[Cells_to_Keep, ]
  Labels <- Labels[Cells_to_Keep]

  # scalar flag check: `&&` is the correct short-circuit operator here
  # (the original `&` is elementwise and meant for vectors)
  use_features <- !is.null(GeneOrderPath) && !is.null(NumGenes)
  if (use_features) {
    GenesOrder <- read.csv(GeneOrderPath)
  }

  #############################################################################
  #                                 SingleR                                   #
  #############################################################################
  library(SingleR)
  library(Seurat)

  True_Labels_SingleR <- list()
  Pred_Labels_SingleR <- list()
  Total_Time_SingleR <- list()
  Data <- t(as.matrix(Data))  # SingleR expects a genes x cells matrix

  for (i in seq_len(n_folds)) {
    if (use_features) {
      # gene indices in GenesOrder are 0-based, hence the +1
      genes_i <- as.vector(GenesOrder[seq_len(NumGenes), i]) + 1
      start_time <- Sys.time()
      singler <- SingleR(method = "single", Data[genes_i, Test_Idx[[i]]],
                         Data[genes_i, Train_Idx[[i]]],
                         Labels[Train_Idx[[i]]], numCores = 1)
      end_time <- Sys.time()
    }
    else {
      start_time <- Sys.time()
      singler <- SingleR(method = "single", Data[, Test_Idx[[i]]], Data[, Train_Idx[[i]]], Labels[Train_Idx[[i]]], numCores = 1)
      end_time <- Sys.time()
    }
    Total_Time_SingleR[i] <- as.numeric(difftime(end_time, start_time, units = 'secs'))
    True_Labels_SingleR[i] <- list(Labels[Test_Idx[[i]]])
    Pred_Labels_SingleR[i] <- list(as.vector(singler$labels))
  }

  True_Labels_SingleR <- as.vector(unlist(True_Labels_SingleR))
  Pred_Labels_SingleR <- as.vector(unlist(Pred_Labels_SingleR))
  Total_Time_SingleR <- as.vector(unlist(Total_Time_SingleR))

  setwd(OutputDir)
  if (use_features) {
    write.csv(True_Labels_SingleR,paste('SingleR_',NumGenes,'_True_Labels.csv', sep = ''),row.names = FALSE)
    write.csv(Pred_Labels_SingleR,paste('SingleR_',NumGenes,'_Pred_Labels.csv', sep = ''),row.names = FALSE)
    write.csv(Total_Time_SingleR,paste('SingleR_',NumGenes,'_Total_Time.csv', sep = ''),row.names = FALSE)
  }
  else {
    write.csv(True_Labels_SingleR,'SingleR_True_Labels.csv',row.names = FALSE)
    write.csv(Pred_Labels_SingleR,'SingleR_Pred_Labels.csv',row.names = FALSE)
    write.csv(Total_Time_SingleR,'SingleR_Total_Time.csv',row.names = FALSE)
  }
}
================================================
FILE: Scripts/run_kNN50.py
================================================
import os
import numpy as np
import pandas as pd
import time as tm
from sklearn.neighbors import KNeighborsClassifier
import rpy2.robjects as robjects
def run_kNN50(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
    '''
    run baseline classifiers: kNN
    Wrapper script to run kNN (with k = 50) classifier on a benchmark dataset with 5-fold cross validation,
    outputs lists of true and predicted cell labels as csv files, as well as computation time.
    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
    as row names and gene names as column names.
    LabelsPath : Cell population annotations file path (.csv).
    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
    OutputDir : Output directory defining the path of the exported file.
    GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
    defining the genes order for each cross validation fold, default is NULL.
    NumGenes : Number of genes used in case of feature selection (integer), default is 0.
    '''
    # read the RData file holding the cross-validation fold definitions
    robjects.r['load'](CV_RDataPath)

    nfolds = np.array(robjects.r['n_folds'], dtype = 'int')
    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
    col = np.array(robjects.r['col_Index'], dtype = 'int')
    col = col - 1  # R column index is 1-based; shift to 0-based for pandas
    test_ind = np.array(robjects.r['Test_Idx'])
    train_ind = np.array(robjects.r['Train_Idx'])

    # read the data and labels, keeping only cells retained by the CV filter
    data = pd.read_csv(DataPath, index_col = 0, sep = ',')
    labels = pd.read_csv(LabelsPath, header = 0, index_col = None, sep = ',', usecols = col)
    labels = labels.iloc[tokeep]
    data = data.iloc[tokeep]

    # read the feature file (one ranked column of gene indices per fold)
    if (NumGenes > 0):
        features = pd.read_csv(GeneOrderPath, header = 0, index_col = None, sep = ',')

    # folder with results
    os.chdir(OutputDir)

    # normalize data
    data = np.log1p(data)

    Classifier = KNeighborsClassifier(n_neighbors = 50)

    tr_time = []
    ts_time = []
    truelab = []
    pred = []

    for i in range(np.squeeze(nfolds)):
        # fold indices come from R and are 1-based; shift to 0-based
        test_ind_i = np.array(test_ind[i], dtype = 'int') - 1
        train_ind_i = np.array(train_ind[i], dtype = 'int') - 1

        train = data.iloc[train_ind_i]
        test = data.iloc[test_ind_i]
        y_train = labels.iloc[train_ind_i]
        y_test = labels.iloc[test_ind_i]

        if (NumGenes > 0):
            # restrict to the top-NumGenes features selected for this fold
            feat_to_use = features.iloc[0:NumGenes, i]
            train = train.iloc[:, feat_to_use]
            test = test.iloc[:, feat_to_use]

        start = tm.time()
        # ravel the single-column label frame to a 1-D array: sklearn expects
        # y of shape (n_samples,) and warns on a column vector
        Classifier.fit(train, np.ravel(y_train))
        tr_time.append(tm.time() - start)

        start = tm.time()
        predicted = Classifier.predict(test)
        ts_time.append(tm.time() - start)

        truelab.extend(y_test.values)
        pred.extend(predicted)

    truelab = pd.DataFrame(truelab)
    pred = pd.DataFrame(pred)
    tr_time = pd.DataFrame(tr_time)
    ts_time = pd.DataFrame(ts_time)

    # write the concatenated per-fold results
    if (NumGenes == 0):
        truelab.to_csv("kNN50_True_Labels.csv", index = False)
        pred.to_csv("kNN50_Pred_Labels.csv", index = False)
        tr_time.to_csv("kNN50_Training_Time.csv", index = False)
        ts_time.to_csv("kNN50_Testing_Time.csv", index = False)
    else:
        truelab.to_csv("kNN50_" + str(NumGenes) + "_True_Labels.csv", index = False)
        pred.to_csv("kNN50_" + str(NumGenes) + "_Pred_Labels.csv", index = False)
        tr_time.to_csv("kNN50_" + str(NumGenes) + "_Training_Time.csv", index = False)
        ts_time.to_csv("kNN50_" + str(NumGenes) + "_Testing_Time.csv", index = False)
================================================
FILE: Scripts/run_kNN9.py
================================================
import os
import numpy as np
import pandas as pd
import time as tm
from sklearn.neighbors import KNeighborsClassifier
import rpy2.robjects as robjects
def run_kNN9(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
    '''
    run baseline classifiers: kNN
    Wrapper script to run kNN (with k = 9) classifier on a benchmark dataset with 5-fold cross validation,
    outputs lists of true and predicted cell labels as csv files, as well as computation time.
    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
    as row names and gene names as column names.
    LabelsPath : Cell population annotations file path (.csv).
    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
    OutputDir : Output directory defining the path of the exported file.
    GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
    defining the genes order for each cross validation fold, default is NULL.
    NumGenes : Number of genes used in case of feature selection (integer), default is 0.
    '''
    # read the RData file holding the cross-validation fold definitions
    robjects.r['load'](CV_RDataPath)

    nfolds = np.array(robjects.r['n_folds'], dtype = 'int')
    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
    col = np.array(robjects.r['col_Index'], dtype = 'int')
    col = col - 1  # R column index is 1-based; shift to 0-based for pandas
    test_ind = np.array(robjects.r['Test_Idx'])
    train_ind = np.array(robjects.r['Train_Idx'])

    # read the data and labels, keeping only cells retained by the CV filter
    data = pd.read_csv(DataPath, index_col = 0, sep = ',')
    labels = pd.read_csv(LabelsPath, header = 0, index_col = None, sep = ',', usecols = col)
    labels = labels.iloc[tokeep]
    data = data.iloc[tokeep]

    # read the feature file (one ranked column of gene indices per fold)
    if (NumGenes > 0):
        features = pd.read_csv(GeneOrderPath, header = 0, index_col = None, sep = ',')

    # folder with results
    os.chdir(OutputDir)

    # normalize data
    data = np.log1p(data)

    Classifier = KNeighborsClassifier(n_neighbors = 9)

    tr_time = []
    ts_time = []
    truelab = []
    pred = []

    for i in range(np.squeeze(nfolds)):
        # fold indices come from R and are 1-based; shift to 0-based
        test_ind_i = np.array(test_ind[i], dtype = 'int') - 1
        train_ind_i = np.array(train_ind[i], dtype = 'int') - 1

        train = data.iloc[train_ind_i]
        test = data.iloc[test_ind_i]
        y_train = labels.iloc[train_ind_i]
        y_test = labels.iloc[test_ind_i]

        if (NumGenes > 0):
            # restrict to the top-NumGenes features selected for this fold
            feat_to_use = features.iloc[0:NumGenes, i]
            train = train.iloc[:, feat_to_use]
            test = test.iloc[:, feat_to_use]

        start = tm.time()
        # ravel the single-column label frame to a 1-D array: sklearn expects
        # y of shape (n_samples,) and warns on a column vector
        Classifier.fit(train, np.ravel(y_train))
        tr_time.append(tm.time() - start)

        start = tm.time()
        predicted = Classifier.predict(test)
        ts_time.append(tm.time() - start)

        truelab.extend(y_test.values)
        pred.extend(predicted)

    truelab = pd.DataFrame(truelab)
    pred = pd.DataFrame(pred)
    tr_time = pd.DataFrame(tr_time)
    ts_time = pd.DataFrame(ts_time)

    # write the concatenated per-fold results
    if (NumGenes == 0):
        truelab.to_csv("kNN9_True_Labels.csv", index = False)
        pred.to_csv("kNN9_Pred_Labels.csv", index = False)
        tr_time.to_csv("kNN9_Training_Time.csv", index = False)
        ts_time.to_csv("kNN9_Testing_Time.csv", index = False)
    else:
        truelab.to_csv("kNN9_" + str(NumGenes) + "_True_Labels.csv", index = False)
        pred.to_csv("kNN9_" + str(NumGenes) + "_Pred_Labels.csv", index = False)
        tr_time.to_csv("kNN9_" + str(NumGenes) + "_Training_Time.csv", index = False)
        ts_time.to_csv("kNN9_" + str(NumGenes) + "_Testing_Time.csv", index = False)
================================================
FILE: Scripts/run_moana.py
================================================
import os
import pandas as pd
import numpy as np
from moana.core import ExpMatrix
from moana.classify import CellTypeClassifier
import time as tm
import rpy2.robjects as robjects
def run_moana(DataPath, LabelsPath, ClassifierPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
    '''
    run moana
    Wrapper script to run moana on a benchmark dataset with a pretrained classifier,
    outputs lists of true and predicted cell labels as csv files, as well as computation time.
    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
    as row names and gene names as column names.
    LabelsPath : Cell population annotations file path (.csv).
    ClassifierPath : Data file path to the pretrained classifier.
    OutputDir : Output directory defining the path of the exported file.
    GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
    defining the genes order for each cross validation fold, default is NULL.
    NumGenes : Number of genes used in case of feature selection (integer), default is 0.
    '''
    matrix = ExpMatrix.read_tsv(DataPath, sep = ',')
    truelab = pd.read_csv(LabelsPath, header = 0, index_col = None, sep = ',')

    # populations the pretrained classifier knows, and the names it uses for them
    ct_old = ['CD19+ B','CD14+ Monocyte','CD4+/CD45RA+/CD25- Naive T','CD4+/CD45RO+ Memory','CD8+/CD45RA+ Naive Cytotoxic','Dendritic', 'CD56+ NK']
    ct_new = ['B cells','CD14+ monocytes','Naive CD4+ T cells','Memory CD4+ T cells','Naive CD8+ T cells','Dendritic cells','NK cells']

    # BUGFIX: np.isin on the label DataFrame yields a 2-D mask; indexing the
    # DataFrame with it masks values to NaN without dropping rows, so the true
    # labels stayed longer than the predictions. Squeeze to a 1-D mask so both
    # the labels and the expression matrix are row-filtered consistently.
    tokeep2 = np.squeeze(np.isin(truelab, ct_old))
    truelab = truelab[tokeep2]
    print(len(truelab))
    matrix = matrix.iloc[tokeep2]

    # map dataset annotations onto the classifier's label vocabulary
    truelab = truelab.replace(dict(zip(ct_old, ct_new)))

    # read the feature file
    if (NumGenes > 0):
        features = pd.read_csv(GeneOrderPath, header = 0, index_col = None, sep = ',')
        feat_to_use = features.iloc[0:NumGenes, 0]
        matrix = matrix.iloc[:, feat_to_use]

    # moana expects genes x cells; the csv is cells x genes, so transpose
    data = ExpMatrix(X = np.transpose(matrix.X), genes = matrix.cells, cells = matrix.genes)
    data.genes.name = 'Genes'
    data.cells.name = 'Cells'
    data.index.name = 'Genes'
    data.columns.name = 'Cells'

    clf = CellTypeClassifier.read_pickle(ClassifierPath)

    start = tm.time()
    predictions = clf.predict(data)
    runtime = tm.time() - start

    pred = pd.DataFrame(predictions)

    os.chdir(OutputDir)
    if (NumGenes == 0):
        truelab.to_csv("moana_True_Labels.csv", index = False)
        pred.to_csv("moana_Pred_Labels.csv", index = False)
        with open("moana_Total_Time.csv", 'w') as f:
            f.write("%f\n" % runtime)
    else:
        truelab.to_csv("moana_" + str(NumGenes) + "_True_Labels.csv", index = False)
        pred.to_csv("moana_" + str(NumGenes) + "_Pred_Labels.csv", index = False)
        with open("moana_" + str(NumGenes) + "_Total_Time.csv", 'w') as f:
            f.write("%f\n" % runtime)
================================================
FILE: Scripts/run_scID.R
================================================
run_scID<-function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){
  "
  run scID
  Wrapper script to run scID on a benchmark dataset with 5-fold cross validation,
  outputs lists of true and predicted cell labels as csv files, as well as computation time.
  Parameters
  ----------
  DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
  as row names and gene names as column names.
  LabelsPath : Cell population annotations file path (.csv).
  CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
  OutputDir : Output directory defining the path of the exported file.
  GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
  defining the genes order for each cross validation fold, default is NULL.
  NumGenes : Number of genes used in case of feature selection (integer), default is NULL.
  "
  Data <- read.csv(DataPath, row.names = 1)
  Labels <- as.matrix(read.csv(LabelsPath))
  # loads n_folds, Train_Idx, Test_Idx, col_Index and Cells_to_Keep
  load(CV_RDataPath)
  Labels <- as.vector(Labels[, col_Index])
  Data <- Data[Cells_to_Keep, ]
  Labels <- Labels[Cells_to_Keep]

  # scalar flag check: `&&` is the correct short-circuit operator here
  # (the original `&` is elementwise and meant for vectors)
  use_features <- !is.null(GeneOrderPath) && !is.null(NumGenes)
  if (use_features) {
    GenesOrder <- read.csv(GeneOrderPath)
  }

  #############################################################################
  #                                   scID                                    #
  #############################################################################
  library(scID)
  library(Seurat)

  True_Labels_scID <- list()
  Pred_Labels_scID <- list()
  Total_Time_scID <- list()
  Data <- t(as.matrix(Data))  # scID expects a genes x cells matrix

  for (i in seq_len(n_folds)) {
    if (use_features) {
      # gene indices in GenesOrder are 0-based, hence the +1
      genes_i <- as.vector(GenesOrder[seq_len(NumGenes), i]) + 1
      Train_Labels <- list(Labels[Train_Idx[[i]]])
      names(Train_Labels[[1]]) <- colnames(Data[genes_i, Train_Idx[[i]]])
      start_time <- Sys.time()
      scID_output <- scid_multiclass(Data[genes_i, Test_Idx[[i]]],
                                     Data[genes_i, Train_Idx[[i]]],
                                     Train_Labels[[1]])
      end_time <- Sys.time()
    }
    else {
      Train_Labels <- list(Labels[Train_Idx[[i]]])
      names(Train_Labels[[1]]) <- colnames(Data[, Train_Idx[[i]]])
      start_time <- Sys.time()
      scID_output <- scid_multiclass(Data[, Test_Idx[[i]]], Data[, Train_Idx[[i]]], Train_Labels[[1]])
      end_time <- Sys.time()
    }
    Total_Time_scID[i] <- as.numeric(difftime(end_time, start_time, units = 'secs'))
    True_Labels_scID[i] <- list(Labels[Test_Idx[[i]]])
    Pred_Labels_scID[i] <- list(as.vector(scID_output$labels))
  }

  True_Labels_scID <- as.vector(unlist(True_Labels_scID))
  Pred_Labels_scID <- as.vector(unlist(Pred_Labels_scID))
  Total_Time_scID <- as.vector(unlist(Total_Time_scID))

  setwd(OutputDir)
  if (use_features) {
    write.csv(True_Labels_scID,paste('scID_',NumGenes,'_True_Labels.csv', sep = ''),row.names = FALSE)
    write.csv(Pred_Labels_scID,paste('scID_',NumGenes,'_Pred_Labels.csv', sep = ''),row.names = FALSE)
    write.csv(Total_Time_scID,paste('scID_',NumGenes,'_Total_Time.csv', sep = ''),row.names = FALSE)
  }
  else {
    write.csv(True_Labels_scID,'scID_True_Labels.csv',row.names = FALSE)
    write.csv(Pred_Labels_scID,'scID_Pred_Labels.csv',row.names = FALSE)
    write.csv(Total_Time_scID,'scID_Total_Time.csv',row.names = FALSE)
  }
}
================================================
FILE: Scripts/run_scPred.R
================================================
run_scPred<-function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){
  "
  run scPred
  Wrapper script to run scPred on a benchmark dataset with 5-fold cross validation,
  outputs lists of true and predicted cell labels as csv files, as well as computation time.
  Parameters
  ----------
  DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
  as row names and gene names as column names.
  LabelsPath : Cell population annotations file path (.csv).
  CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
  OutputDir : Output directory defining the path of the exported file.
  GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
  defining the genes order for each cross validation fold, default is NULL.
  NumGenes : Number of genes used in case of feature selection (integer), default is NULL.
  "
  Data <- read.csv(DataPath,row.names = 1)
  Labels <- as.matrix(read.csv(LabelsPath))
  # load() brings n_folds, Train_Idx, Test_Idx, col_Index and Cells_to_Keep into scope
  load(CV_RDataPath)
  Labels <- as.vector(Labels[,col_Index])
  # drop cells filtered out during cross-validation setup
  Data <- Data[Cells_to_Keep,]
  Labels <- Labels[Cells_to_Keep]
  # feature selection is active only when both arguments are supplied
  if(!is.null(GeneOrderPath) & !is.null (NumGenes)){
    GenesOrder = read.csv(GeneOrderPath)
  }
  #############################################################################
  #                                  scPred                                   #
  #############################################################################
  library(scPred)
  library(tidyverse)
  library(SingleCellExperiment)
  True_Labels_scPred <- list()
  Pred_Labels_scPred <- list()
  Training_Time_scPred <- list()
  Testing_Time_scPred <- list()
  # transpose to genes x cells, the orientation SingleCellExperiment expects
  Data = t(as.matrix(Data))
  for (i in c(1:n_folds)){
    if(!is.null(GeneOrderPath) & !is.null (NumGenes)){
      # build train/test SingleCellExperiments restricted to the fold's
      # top-NumGenes features (GenesOrder indices are 0-based, hence +1)
      sce <- SingleCellExperiment(list(normcounts = Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Train_Idx[[i]]]),
                                  colData = data.frame(cell_type1 = Labels[Train_Idx[[i]]]))
      sce_counts <- normcounts(sce)
      # counts-per-million normalization, column (cell) wise
      sce_cpm <- apply(sce_counts, 2, function(x) (x/sum(x))*1000000)
      sce_metadata <- as.data.frame(colData(sce))
      sce_test <- SingleCellExperiment(list(normcounts = Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Test_Idx[[i]]]),
                                       colData = data.frame(cell_type1 = Labels[Test_Idx[[i]]]))
      sce_counts_test <- normcounts(sce_test)
      sce_cpm_test <- apply(sce_counts_test, 2, function(x) (x/sum(x))*1000000)
      sce_metadata_test <- as.data.frame(colData(sce_test))
    }
    else{
      # same construction on the full gene set
      sce <- SingleCellExperiment(list(normcounts = Data[,Train_Idx[[i]]]),
                                  colData = data.frame(cell_type1 = Labels[Train_Idx[[i]]]))
      sce_counts <- normcounts(sce)
      sce_cpm <- apply(sce_counts, 2, function(x) (x/sum(x))*1000000)
      sce_metadata <- as.data.frame(colData(sce))
      sce_test <- SingleCellExperiment(list(normcounts = Data[,Test_Idx[[i]]]),
                                       colData = data.frame(cell_type1 = Labels[Test_Idx[[i]]]))
      sce_counts_test <- normcounts(sce_test)
      sce_cpm_test <- apply(sce_counts_test, 2, function(x) (x/sum(x))*1000000)
      sce_metadata_test <- as.data.frame(colData(sce_test))
    }
    # scPred Training: the pipeline order (eigenDecompose -> metadata<- ->
    # getFeatureSpace -> trainModel) is required by the scPred API
    start_time <- Sys.time()
    set.seed(1234)  # fixed seed so eigendecomposition/training are reproducible
    scp <- eigenDecompose(sce_cpm)
    scPred::metadata(scp) <- sce_metadata
    scp <- getFeatureSpace(scp, pVar = 'cell_type1')
    # plotEigen(scp, group = 'cell_type1')
    scp <- trainModel(scp)
    # plotTrainProbs(scp)
    end_time <- Sys.time()
    Training_Time_scPred[i] <- as.numeric(difftime(end_time,start_time,units = 'secs'))
    # scPred Prediction
    start_time <- Sys.time()
    scp <- scPredict(scp,newData = sce_cpm_test)
    end_time <- Sys.time()
    Testing_Time_scPred[i] <- as.numeric(difftime(end_time,start_time,units = 'secs'))
    True_Labels_scPred[i] <- list(Labels[Test_Idx[[i]]])
    Pred_Labels_scPred[i] <- list(getPredictions(scp)$predClass)
  }
  # flatten the per-fold lists into single vectors before writing
  True_Labels_scPred <- as.vector(unlist(True_Labels_scPred))
  Pred_Labels_scPred <- as.vector(unlist(Pred_Labels_scPred))
  Training_Time_scPred <- as.vector(unlist(Training_Time_scPred))
  Testing_Time_scPred <- as.vector(unlist(Testing_Time_scPred))
  setwd(OutputDir)
  if(!is.null(GeneOrderPath) & !is.null (NumGenes)){
    write.csv(True_Labels_scPred,paste('scPred_',NumGenes,'_True_Labels.csv', sep = ''),row.names = FALSE)
    write.csv(Pred_Labels_scPred,paste('scPred_',NumGenes,'_Pred_Labels.csv', sep = ''),row.names = FALSE)
    write.csv(Training_Time_scPred,paste('scPred_',NumGenes,'_Training_Time.csv', sep = ''),row.names = FALSE)
    write.csv(Testing_Time_scPred,paste('scPred_',NumGenes,'_Testing_Time.csv', sep = ''),row.names = FALSE)
  }
  else{
    write.csv(True_Labels_scPred,'scPred_True_Labels.csv',row.names = FALSE)
    write.csv(Pred_Labels_scPred,'scPred_Pred_Labels.csv',row.names = FALSE)
    write.csv(Training_Time_scPred,'scPred_Training_Time.csv',row.names = FALSE)
    write.csv(Testing_Time_scPred,'scPred_Testing_Time.csv',row.names = FALSE)
  }
}
================================================
FILE: Scripts/run_scVI.py
================================================
from scvi.dataset import CsvDataset
import os
import numpy as np
import pandas as pd
from scvi.models import SCANVI
from scvi.inference import SemiSupervisedTrainer
import time as tm
import rpy2.robjects as robjects
def run_scVI(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
    '''
    run scVI
    Wrapper script to run scVI (SCANVI) on a benchmark dataset with 5-fold cross validation,
    outputs lists of true and predicted cell labels as csv files, as well as computation time.

    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
        as row names and gene names as column names.
    LabelsPath : Cell population annotations file path (.csv).
    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
    OutputDir : Output directory defining the path of the exported file.
    GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
        defining the genes order for each cross validation fold, default is "".
    NumGenes : Number of genes used in case of feature selection (integer), default is 0.

    Notes
    -----
    SCANVI returns labels as integer codes (and predictions for the held-out
    indices in permuted order); mapping back to the original label strings is
    left to downstream evaluation.
    '''
    # Load the fold definitions produced by Cross_Validation.R.
    robjects.r['load'](CV_RDataPath)
    nfolds = np.array(robjects.r['n_folds'], dtype = 'int')
    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
    col = np.array(robjects.r['col_Index'], dtype = 'int')
    col = col - 1  # R indices are 1-based; pandas usecols is 0-based
    test_ind = np.array(robjects.r['Test_Idx'])
    train_ind = np.array(robjects.r['Train_Idx'])

    # Read the expression matrix and annotations, dropping cells that belong
    # to populations filtered out during CV setup.
    data = pd.read_csv(DataPath, index_col=0, sep=',')
    labels = pd.read_csv(LabelsPath, header=0, index_col=None, sep=',', usecols = col)
    labels = labels.iloc[tokeep]
    data = data.iloc[tokeep]

    # The per-fold gene ranking is only needed when feature selection is on.
    if (NumGenes > 0):
        features = pd.read_csv(GeneOrderPath, header=0, index_col=None, sep=',')

    os.chdir(OutputDir)

    if (NumGenes == 0):
        # CsvDataset reads from disk, so materialize the filtered data once;
        # without feature selection the same dataset serves every fold.
        labels.to_csv('Labels_scvi.csv')
        data.to_csv('Data_scvi.csv')
        train = CsvDataset('Data_scvi.csv', save_path = "", sep = ",", labels_file = "Labels_scvi.csv", gene_by_cell = False)

    n_epochs = 200
    truelab = []
    pred = []
    tr_time = []
    ts_time = []

    for i in range(np.squeeze(nfolds)):
        test_ind_i = np.array(test_ind[i], dtype = 'int') - 1   # R -> 0-based
        train_ind_i = np.array(train_ind[i], dtype = 'int') - 1

        if (NumGenes > 0):
            # Subset to this fold's top-ranked genes and rebuild the dataset.
            feat_to_use = features.iloc[0:NumGenes, i]
            data2 = data.iloc[:, feat_to_use]
            labels.to_csv('Labels_scvi.csv')
            data2.to_csv('Data_scvi.csv')
            train = CsvDataset('Data_scvi.csv', save_path = "", sep = ",", labels_file = "Labels_scvi.csv", gene_by_cell = False, new_n_genes = False)

        # Build a FRESH model and trainer for every fold. The original code
        # created them once before the loop when NumGenes == 0, so weights
        # trained on earlier folds leaked into later folds.
        scanvi = SCANVI(train.nb_genes, train.n_batches, train.n_labels)
        trainer_scanvi = SemiSupervisedTrainer(scanvi, train, frequency=5)

        # Override the trainer's automatic split with the CV fold indices.
        trainer_scanvi.labelled_set = trainer_scanvi.create_posterior(indices=(train_ind_i).ravel(), shuffle = False)
        trainer_scanvi.labelled_set.to_monitor = ['ll','accuracy']
        trainer_scanvi.unlabelled_set = trainer_scanvi.create_posterior(indices=(test_ind_i).ravel(), shuffle = False)
        trainer_scanvi.unlabelled_set.to_monitor = ['ll','accuracy']

        start = tm.time()
        trainer_scanvi.train(n_epochs)
        tr_time.append(tm.time()-start)

        # Predicted labels come back as integer codes; indices are permuted.
        start = tm.time()
        y_true, y_pred = trainer_scanvi.unlabelled_set.compute_predictions()
        ts_time.append(tm.time()-start)
        truelab.extend(y_true)
        pred.extend(y_pred)

    # Write per-cell results and per-fold timings.
    truelab = pd.DataFrame(truelab)
    pred = pd.DataFrame(pred)
    tr_time = pd.DataFrame(tr_time)
    ts_time = pd.DataFrame(ts_time)

    if (NumGenes == 0):
        truelab.to_csv("scVI_True_Labels.csv", index = False)
        pred.to_csv("scVI_Pred_Labels.csv", index = False)
        tr_time.to_csv("scVI_Training_Time.csv", index = False)
        ts_time.to_csv("scVI_Testing_Time.csv", index = False)
    else:
        truelab.to_csv("scVI_" + str(NumGenes) + "_True_Labels.csv", index = False)
        pred.to_csv("scVI_" + str(NumGenes) + "_Pred_Labels.csv", index = False)
        tr_time.to_csv("scVI_" + str(NumGenes) + "_Training_Time.csv", index = False)
        ts_time.to_csv("scVI_" + str(NumGenes) + "_Testing_Time.csv", index = False)
================================================
FILE: Scripts/run_scmap.R
================================================
run_scmap <- function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){
  "
  run scmap
  Wrapper script to run scmap (both scmap-cluster and scmap-cell) on a benchmark
  dataset with 5-fold cross validation, outputs lists of true and predicted cell
  labels as csv files, as well as computation time.
  Parameters
  ----------
  DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
  as row names and gene names as column names.
  LabelsPath : Cell population annotations file path (.csv).
  CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
  OutputDir : Output directory defining the path of the exported file.
  GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
  defining the genes order for each cross validation fold, default is NULL.
  NumGenes : Number of genes used in case of feature selection (integer), default is NULL.
  "
  Data <- read.csv(DataPath, row.names = 1)
  Labels <- as.matrix(read.csv(LabelsPath))
  # Provides n_folds, Train_Idx, Test_Idx, col_Index and Cells_to_Keep.
  load(CV_RDataPath)
  Labels <- as.vector(Labels[, col_Index])
  # Drop cells from populations removed during CV setup.
  Data <- Data[Cells_to_Keep, ]
  Labels <- Labels[Cells_to_Keep]
  # && (not &) for scalar short-circuit conditions on possibly-NULL args.
  if (!is.null(GeneOrderPath) && !is.null(NumGenes)) {
    GenesOrder <- read.csv(GeneOrderPath)
  }

  #############################################################################
  #                               scmap                                       #
  #############################################################################
  library(scmap)
  library(SingleCellExperiment)
  True_Labels_scmapcluster <- list()
  Pred_Labels_scmapcluster <- list()
  True_Labels_scmapcell <- list()
  Pred_Labels_scmapcell <- list()
  Training_Time_scmapcluster <- list()
  Testing_Time_scmapcluster <- list()
  Training_Time_scmapcell <- list()
  Testing_Time_scmapcell <- list()
  # scmap expects genes x cells.
  Data <- t(as.matrix(Data))

  for (i in c(1:n_folds)) {
    if (!is.null(GeneOrderPath) && !is.null(NumGenes)) {
      # GenesOrder holds 0-based row indices, hence the +1.
      sce <- SingleCellExperiment(list(normcounts = Data[as.vector(GenesOrder[c(1:NumGenes), i]) + 1, Train_Idx[[i]]]),
                                  colData = data.frame(cell_type1 = Labels[Train_Idx[[i]]]))
      logcounts(sce) <- log2(normcounts(sce) + 1)
      # use gene names as feature symbols
      rowData(sce)$feature_symbol <- rownames(sce)
      sce <- selectFeatures(sce, n_features = NumGenes, suppress_plot = TRUE)
      sce_test <- SingleCellExperiment(list(normcounts = Data[as.vector(GenesOrder[c(1:NumGenes), i]) + 1, Test_Idx[[i]]]),
                                       colData = data.frame(cell_type1 = Labels[Test_Idx[[i]]]))
      logcounts(sce_test) <- log2(normcounts(sce_test) + 1)
      rowData(sce_test)$feature_symbol <- rownames(sce_test)
      # Direct slot access copies the training rowData (incl. scmap's selected
      # feature flags) onto the test object so both use the same features.
      # NOTE(review): relies on SummarizedExperiment internals — confirm on upgrade.
      sce_test@rowRanges@elementMetadata@listData = sce@rowRanges@elementMetadata@listData
    }
    else {
      sce <- SingleCellExperiment(list(normcounts = Data[, Train_Idx[[i]]]),
                                  colData = data.frame(cell_type1 = Labels[Train_Idx[[i]]]))
      logcounts(sce) <- log2(normcounts(sce) + 1)
      # use gene names as feature symbols
      rowData(sce)$feature_symbol <- rownames(sce)
      sce <- selectFeatures(sce, suppress_plot = TRUE)
      sce_test <- SingleCellExperiment(list(normcounts = Data[, Test_Idx[[i]]]),
                                       colData = data.frame(cell_type1 = Labels[Test_Idx[[i]]]))
      logcounts(sce_test) <- log2(normcounts(sce_test) + 1)
      rowData(sce_test)$feature_symbol <- rownames(sce_test)
      # See note above: share the training feature selection with the test set.
      sce_test@rowRanges@elementMetadata@listData = sce@rowRanges@elementMetadata@listData
    }

    # scmap-cluster: index = training time, projection = testing time.
    start_time <- Sys.time()
    sce <- indexCluster(sce)
    end_time <- Sys.time()
    Training_Time_scmapcluster[i] <- as.numeric(difftime(end_time, start_time, units = 'secs'))
    start_time <- Sys.time()
    scmapCluster_results <- scmapCluster(projection = sce_test, index_list = list(metadata(sce)$scmap_cluster_index))
    end_time <- Sys.time()
    Testing_Time_scmapcluster[i] <- as.numeric(difftime(end_time, start_time, units = 'secs'))
    True_Labels_scmapcluster[i] <- list(Labels[Test_Idx[[i]]])
    Pred_Labels_scmapcluster[i] <- list(scmapCluster_results$combined_labs)

    # scmap-cell: seeded because indexCell uses stochastic k-means.
    start_time <- Sys.time()
    set.seed(1)
    sce <- indexCell(sce)
    end_time <- Sys.time()
    Training_Time_scmapcell[i] <- as.numeric(difftime(end_time, start_time, units = 'secs'))
    start_time <- Sys.time()
    scmapCell_results <- scmapCell(sce_test, list(metadata(sce)$scmap_cell_index))
    scmapCell_clusters <- scmapCell2Cluster(scmapCell_results, list(as.character(colData(sce)$cell_type1)))
    end_time <- Sys.time()
    Testing_Time_scmapcell[i] <- as.numeric(difftime(end_time, start_time, units = 'secs'))
    True_Labels_scmapcell[i] <- list(Labels[Test_Idx[[i]]])
    Pred_Labels_scmapcell[i] <- list(scmapCell_clusters$combined_labs)
  }

  # Flatten the per-fold lists into plain vectors for export.
  True_Labels_scmapcluster <- as.vector(unlist(True_Labels_scmapcluster))
  Pred_Labels_scmapcluster <- as.vector(unlist(Pred_Labels_scmapcluster))
  True_Labels_scmapcell <- as.vector(unlist(True_Labels_scmapcell))
  Pred_Labels_scmapcell <- as.vector(unlist(Pred_Labels_scmapcell))
  Training_Time_scmapcluster <- as.vector(unlist(Training_Time_scmapcluster))
  Testing_Time_scmapcluster <- as.vector(unlist(Testing_Time_scmapcluster))
  Training_Time_scmapcell <- as.vector(unlist(Training_Time_scmapcell))
  Testing_Time_scmapcell <- as.vector(unlist(Testing_Time_scmapcell))

  setwd(OutputDir)
  if (!is.null(GeneOrderPath) && !is.null(NumGenes)) {
    write.csv(True_Labels_scmapcluster, paste('scmapcluster_', NumGenes, '_True_Labels.csv', sep = ''), row.names = FALSE)
    write.csv(Pred_Labels_scmapcluster, paste('scmapcluster_', NumGenes, '_Pred_Labels.csv', sep = ''), row.names = FALSE)
    write.csv(True_Labels_scmapcell, paste('scmapcell_', NumGenes, '_True_Labels.csv', sep = ''), row.names = FALSE)
    write.csv(Pred_Labels_scmapcell, paste('scmapcell_', NumGenes, '_Pred_Labels.csv', sep = ''), row.names = FALSE)
    write.csv(Training_Time_scmapcluster, paste('scmapcluster_', NumGenes, '_Training_Time.csv', sep = ''), row.names = FALSE)
    write.csv(Testing_Time_scmapcluster, paste('scmapcluster_', NumGenes, '_Testing_Time.csv', sep = ''), row.names = FALSE)
    write.csv(Training_Time_scmapcell, paste('scmapcell_', NumGenes, '_Training_Time.csv', sep = ''), row.names = FALSE)
    write.csv(Testing_Time_scmapcell, paste('scmapcell_', NumGenes, '_Testing_Time.csv', sep = ''), row.names = FALSE)
  }
  else {
    write.csv(True_Labels_scmapcluster, 'scmapcluster_True_Labels.csv', row.names = FALSE)
    write.csv(Pred_Labels_scmapcluster, 'scmapcluster_Pred_Labels.csv', row.names = FALSE)
    write.csv(True_Labels_scmapcell, 'scmapcell_True_Labels.csv', row.names = FALSE)
    write.csv(Pred_Labels_scmapcell, 'scmapcell_Pred_Labels.csv', row.names = FALSE)
    write.csv(Training_Time_scmapcluster, 'scmapcluster_Training_Time.csv', row.names = FALSE)
    write.csv(Testing_Time_scmapcluster, 'scmapcluster_Testing_Time.csv', row.names = FALSE)
    write.csv(Training_Time_scmapcell, 'scmapcell_Training_Time.csv', row.names = FALSE)
    write.csv(Testing_Time_scmapcell, 'scmapcell_Testing_Time.csv', row.names = FALSE)
  }
}
================================================
FILE: Scripts/run_singleCellNet.R
================================================
run_singleCellNet <- function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){
  "
  run singleCellNet
  Wrapper script to run singleCellNet on a benchmark dataset with 5-fold cross validation,
  outputs lists of true and predicted cell labels as csv files, as well as computation time.
  Parameters
  ----------
  DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
  as row names and gene names as column names.
  LabelsPath : Cell population annotations file path (.csv).
  CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
  OutputDir : Output directory defining the path of the exported file.
  GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
  defining the genes order for each cross validation fold, default is NULL.
  NumGenes : Number of genes used in case of feature selection (integer), default is NULL.
  "
  Data <- read.csv(DataPath, row.names = 1)
  # singleCellNet's gene-pair names use '.', so normalize '_' in gene names.
  colnames(Data) <- gsub('_', '.', colnames(Data), fixed = TRUE)
  Labels <- as.matrix(read.csv(LabelsPath))
  # Provides n_folds, Train_Idx, Test_Idx, col_Index and Cells_to_Keep.
  load(CV_RDataPath)
  Labels <- as.vector(Labels[, col_Index])
  Data <- Data[Cells_to_Keep, ]
  Labels <- Labels[Cells_to_Keep]
  # && (not &) for scalar short-circuit conditions on possibly-NULL args.
  if (!is.null(GeneOrderPath) && !is.null(NumGenes)) {
    GenesOrder <- read.csv(GeneOrderPath)
  }

  #############################################################################
  #                            singleCellNet                                  #
  #############################################################################
  library(singleCellNet)
  library(dplyr)
  True_Labels_singleCellNet <- list()
  Pred_Labels_singleCellNet <- list()
  Training_Time_singleCellNet <- list()
  Testing_Time_singleCellNet <- list()
  Data <- t(as.matrix(Data)) # genes x cells; deals also with sparse matrix

  for (i in c(1:n_folds)) {
    if (!is.null(GeneOrderPath) && !is.null(NumGenes)) {
      # GenesOrder holds 0-based row indices, hence the +1.
      DataTrain <- Data[as.vector(GenesOrder[c(1:NumGenes), i]) + 1, Train_Idx[[i]]]
      DataTest <- Data[as.vector(GenesOrder[c(1:NumGenes), i]) + 1, Test_Idx[[i]]]
    }
    else {
      DataTrain <- Data[, Train_Idx[[i]]]
      DataTest <- Data[, Test_Idx[[i]]]
    }

    # Training: find class-informative genes, top gene pairs, then the RF.
    start_time <- Sys.time()
    cgenes2 <- findClassyGenes(DataTrain, data.frame(Annotation = Labels[Train_Idx[[i]]]), "Annotation")
    cgenesA <- cgenes2[['cgenes']]
    grps <- cgenes2[['grps']]
    DataTrain <- as.matrix(DataTrain[cgenesA, ])
    xpairs <- ptGetTop(DataTrain, grps, ncores = 1)
    pdTrain <- query_transform(DataTrain[cgenesA, ], xpairs)
    rf <- sc_makeClassifier(pdTrain[xpairs, ], genes = xpairs, groups = grps)
    end_time <- Sys.time()
    Training_Time_singleCellNet[i] <- as.numeric(difftime(end_time, start_time, units = 'secs'))

    # Testing: transform test cells into pair space and classify.
    start_time <- Sys.time()
    DataTest <- query_transform(DataTest[cgenesA, ], xpairs)
    classRes <- rf_classPredict(rf, DataTest)
    end_time <- Sys.time()
    Testing_Time_singleCellNet[i] <- as.numeric(difftime(end_time, start_time, units = 'secs'))

    True_Labels_singleCellNet[i] <- list(Labels[Test_Idx[[i]]])
    # rf_classPredict may append extra (random) columns; keep only the
    # predictions for the actual test cells.
    Pred_Labels_singleCellNet[i] <- list((rownames(classRes)[apply(classRes, 2, which.max)])[1:length(Test_Idx[[i]])])
  }

  # Flatten the per-fold lists into plain vectors for export.
  True_Labels_singleCellNet <- as.vector(unlist(True_Labels_singleCellNet))
  Pred_Labels_singleCellNet <- as.vector(unlist(Pred_Labels_singleCellNet))
  Training_Time_singleCellNet <- as.vector(unlist(Training_Time_singleCellNet))
  Testing_Time_singleCellNet <- as.vector(unlist(Testing_Time_singleCellNet))

  setwd(OutputDir)
  if (!is.null(GeneOrderPath) && !is.null(NumGenes)) {
    write.csv(True_Labels_singleCellNet, paste('singleCellNet_', NumGenes, '_True_Labels.csv', sep = ''), row.names = FALSE)
    write.csv(Pred_Labels_singleCellNet, paste('singleCellNet_', NumGenes, '_Pred_Labels.csv', sep = ''), row.names = FALSE)
    write.csv(Training_Time_singleCellNet, paste('singleCellNet_', NumGenes, '_Training_Time.csv', sep = ''), row.names = FALSE)
    write.csv(Testing_Time_singleCellNet, paste('singleCellNet_', NumGenes, '_Testing_Time.csv', sep = ''), row.names = FALSE)
  }
  else {
    write.csv(True_Labels_singleCellNet, 'singleCellNet_True_Labels.csv', row.names = FALSE)
    write.csv(Pred_Labels_singleCellNet, 'singleCellNet_Pred_Labels.csv', row.names = FALSE)
    write.csv(Training_Time_singleCellNet, 'singleCellNet_Training_Time.csv', row.names = FALSE)
    write.csv(Testing_Time_singleCellNet, 'singleCellNet_Testing_Time.csv', row.names = FALSE)
  }
}
================================================
FILE: Snakemake/Cross_Validation.R
================================================
args <- commandArgs(TRUE)

Cross_Validation <- function(LabelsPath, col_Index = 1, OutputDir){
  "
  Cross_Validation
  Function returns train and test indices for 5 folds stratified across unique cell populations,
  also filters out cell populations with 10 cells or fewer.
  It saves a 'CV_folds.RData' file which is then used as input to the classifier wrappers.
  Parameters
  ----------
  LabelsPath : Cell population annotations file path (.csv).
  col_Index : column index (integer) defining which level of annotation to use,
  in case of multiple cell type annotations (default is 1)
  OutputDir : Output directory defining the path of the exported file.
  "
  Labels <- as.matrix(read.csv(LabelsPath))
  Labels <- as.vector(Labels[, col_Index])
  # Classes whose count is NOT > 10 (i.e. 10 cells or fewer) are removed.
  Removed_classes <- !(table(Labels) > 10)
  Cells_to_Keep <- !(is.element(Labels, names(Removed_classes)[Removed_classes]))
  Labels <- Labels[Cells_to_Keep]

  # Build stratified training and testing folds.
  library(rBayesianOptimization)
  n_folds <- 5
  Folds <- KFold(Labels, nfolds = n_folds, stratified = TRUE)
  Test_Folds <- n_folds:1
  Train_Idx <- list()
  Test_Idx <- list()
  for (i in seq_along(Folds)) {
    Temp_Folds <- Folds
    Temp_Folds[Test_Folds[i]] <- NULL   # drop the held-out fold from training
    Train_Idx[i] <- list(unlist(Temp_Folds))
    Test_Idx[i] <- Folds[Test_Folds[i]]
  }
  remove(Temp_Folds, i, Folds)
  save(n_folds, Train_Idx, Test_Idx, col_Index, Cells_to_Keep,
       file = paste0(OutputDir, '/CV_folds.RData'))
}

Cross_Validation(args[1], as.numeric(args[2]), args[3])
================================================
FILE: Snakemake/DEgenesMAST.R
================================================
DEgenesMAST <- function(Data, Labels, Normalize = FALSE, LogTransform = FALSE){
  # Differential expression (one vs all) via MAST through Seurat.
  # The TRAINING data should be used as input.
  # Returns a 20 x n_populations character matrix of marker genes: columns are
  # cell populations, rows are (up to) the top 20 upregulated marker genes.
  # Populations with fewer than 20 markers leave the remaining rows NA.
  # This output can be rewritten to the format of the prior-knowledge-supervised
  # classifiers and then used to classify the test set.
  #
  # Data: genes X cells (rows = genes, columns = cells)
  # Labels: labels of the data
  # Normalize: MAST expects CPM-normalized data; set TRUE if not normalized yet
  # LogTransform: MAST expects log-transformed data; set TRUE if not yet logged
  library(Seurat)
  if (Normalize) {
    # Counts-per-million per cell.
    Data <- apply(Data, 2, function(x) (x / sum(x)) * 1000000)
  }
  if (LogTransform) {
    Data <- log(Data + 1, base = 2)
  }
  # NOTE(review): raw.data / SetIdent(ident.use=) is the Seurat v2 API —
  # this will not run unchanged under Seurat >= 3.
  SeuObj <- CreateSeuratObject(raw.data = Data, project = "DEgenes")
  SeuObj <- SetIdent(SeuObj, ident.use = Labels)
  DEgenes <- FindAllMarkers(SeuObj, test.use = "MAST")
  Markers <- matrix(nrow = 20, ncol = length(unique(Labels)))
  colnames(Markers) <- unique(Labels)
  for (i in unique(Labels)) {
    # Keep only upregulated genes for this population (positive log fold change).
    TempList <- DEgenes$gene[(DEgenes$cluster == i) & (DEgenes$avg_logFC > 0)]
    if (length(TempList) >= 20) {
      Markers[, i] <- TempList[1:20]
    } else if (length(TempList) > 0) {
      Markers[seq_along(TempList), i] <- TempList
    }
  }
  return(Markers)
}
================================================
FILE: Snakemake/Dockerfiles/baseline/Dockerfile
================================================
# Image for the "baseline" classifiers (kNN, LDA, NMC, RF, SVM).
# Needs both R (rpy2 reads CV_folds.RData) and Python 3 with scikit-learn.
FROM debian:9.9-slim
# Install newest R version
# Add the CRAN apt repo (key + source), install r-base, then purge the
# bootstrap tools and apt caches to keep the image small.
RUN apt-get update && \
apt-get install --no-install-recommends --yes wget gnupg apt-transport-https && \
wget -qO - http://keys.gnupg.net/pks/lookup?op=get\&search=0xAD5F960A256A04AF | apt-key add - && \
echo 'deb http://cloud.r-project.org/bin/linux/debian stretch-cran35/' >> /etc/apt/sources.list && \
apt-get update && \
apt-get install --no-install-recommends --yes r-base && \
apt-get purge --yes wget gnupg apt-transport-https && \
apt-get autoremove --yes && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
# Install python
# Python 3 + the pip packages shared by every baseline wrapper script.
RUN apt-get update && \
apt-get install --no-install-recommends --yes python3 python3-pip && \
pip3 --no-cache-dir install setuptools && \
pip3 --no-cache-dir install pandas rpy2 scikit-learn statsmodels && \
rm -rf /var/lib/apt/lists/*
# Wrapper scripts; COPY paths are relative to the Snakemake/ build context.
COPY Scripts/run_kNN50.py \
Scripts/run_kNN9.py \
Scripts/run_LDA.py \
Scripts/run_LDA_rejection.py \
Scripts/run_NMC.py \
Scripts/run_RF.py \
Scripts/run_SVM.py \
Scripts/run_SVM_rejection.py \
rank_gene_dropouts.py \
/Scripts/
================================================
FILE: Snakemake/Dockerfiles/cell_blast/Dockerfile
================================================
# Image for the Cell-BLAST classifier (Python 3.7 base; R added for rpy2).
FROM python:3.7-slim-stretch
# Install newest R version
# Add the CRAN apt repo, install r-base, then purge bootstrap tools/caches.
RUN apt-get update && \
apt-get install --no-install-recommends --yes wget gnupg apt-transport-https && \
wget -qO - http://keys.gnupg.net/pks/lookup?op=get\&search=0xAD5F960A256A04AF | apt-key add - && \
echo 'deb http://cloud.r-project.org/bin/linux/debian stretch-cran35/' >> /etc/apt/sources.list && \
apt-get update && \
apt-get install --no-install-recommends --yes r-base && \
apt-get purge --yes wget gnupg apt-transport-https && \
apt-get autoremove --yes && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
# Install python and pip deps
# Build toolchain is required to compile rpy2 / Cell-BLAST native extensions.
RUN apt-get update && apt-get install --no-install-recommends --yes build-essential python3-dev libxml2 libxml2-dev zlib1g-dev && \
pip3 --no-cache-dir install --upgrade pip && \
pip3 --no-cache-dir install --upgrade setuptools && \
pip3 --no-cache-dir install pandas rpy2 scikit-learn statsmodels tensorflow Cell-BLAST && \
apt-get autoremove --yes && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
# Wrapper script for this method.
COPY Scripts/run_Cell_BLAST.py /Scripts/
================================================
FILE: Snakemake/Dockerfiles/chetah/Dockerfile
================================================
# Image for the CHETAH classifier (R only).
FROM debian:9.9-slim
# Install newest R version
# Add the CRAN apt repo, install r-base, then purge bootstrap tools/caches.
RUN apt-get update && \
apt-get install --no-install-recommends --yes wget gnupg apt-transport-https && \
wget -qO - http://keys.gnupg.net/pks/lookup?op=get\&search=0xAD5F960A256A04AF | apt-key add - && \
echo 'deb http://cloud.r-project.org/bin/linux/debian stretch-cran35/' >> /etc/apt/sources.list && \
apt-get update && \
apt-get install --no-install-recommends --yes r-base && \
apt-get purge --yes wget gnupg apt-transport-https && \
apt-get autoremove --yes && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
# Wrapper script plus the package-install script run below.
COPY Scripts/run_CHETAH.R \
Dockerfiles/chetah/install_packages.R \
/Scripts/
# Install R packages
# Dev toolchain only for the install step; purged afterwards to shrink the image.
RUN apt-get update && \
apt-get install --no-install-recommends --yes make gcc g++ r-base-dev libcurl4-openssl-dev libssl-dev libxml2-dev && \
Rscript --vanilla /Scripts/install_packages.R && \
apt-get purge --yes make gcc g++ r-base-dev libcurl4-openssl-dev libssl-dev libxml2-dev && \
apt-get autoremove --yes && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
================================================
FILE: Snakemake/Dockerfiles/chetah/install_packages.R
================================================
# Installs CHETAH and its plotting dependencies inside the Docker image.
# withCallingHandlers promotes every warning to an error so that a failed
# package install aborts the docker build instead of passing silently.
withCallingHandlers({
install.packages("devtools", repos="https://cloud.r-project.org/")
install.packages("BiocManager", repos="https://cloud.r-project.org/")
BiocManager::install(c("bioDist", "ggplot2", "gplots", "cowplot",
"dendextend", "corrplot", "reshape2", "plotly"))
# CHETAH pinned to a specific commit for reproducible builds.
devtools::install_github("jdekanter/CHETAH", ref="b777e6f671bff3c434842adb655869a52bc9e368")
},
warning = function(w) stop(w))
================================================
FILE: Snakemake/Dockerfiles/cross_validation/Dockerfile
================================================
# Image that generates the CV_folds.RData file used by all classifiers.
FROM debian:9.9-slim
# Install newest R version
# Add the CRAN apt repo, install r-base, then purge bootstrap tools/caches.
RUN apt-get update && \
apt-get install --no-install-recommends --yes wget gnupg apt-transport-https && \
wget -qO - http://keys.gnupg.net/pks/lookup?op=get\&search=0xAD5F960A256A04AF | apt-key add - && \
echo 'deb http://cloud.r-project.org/bin/linux/debian stretch-cran35/' >> /etc/apt/sources.list && \
apt-get update && \
apt-get install --no-install-recommends --yes r-base && \
apt-get purge --yes wget gnupg apt-transport-https && \
apt-get autoremove --yes && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
# The CV script plus the package-install script run below.
COPY Cross_Validation.R \
Dockerfiles/cross_validation/install_packages.R \
/Scripts/
# Install R packages
# Dev toolchain only for the install step; purged afterwards.
RUN apt-get update && \
apt-get install --no-install-recommends --yes make gcc libc6-dev g++ libxml2-dev && \
Rscript --vanilla /Scripts/install_packages.R && \
apt-get purge --yes make gcc g++ libxml2-dev && \
apt-get autoremove --yes && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
================================================
FILE: Snakemake/Dockerfiles/cross_validation/install_packages.R
================================================
# Installs rBayesianOptimization (provides KFold for stratified CV splits).
# withCallingHandlers promotes warnings to errors so a failed install
# aborts the docker build instead of passing silently.
withCallingHandlers({
install.packages("lhs", repos="https://cloud.r-project.org/")
install.packages("rBayesianOptimization", repos="https://cloud.r-project.org/")
},
warning = function(w) stop(w))
================================================
FILE: Snakemake/Dockerfiles/garnett/Dockerfile
================================================
# Image for the Garnett classifier (CV and pretrained variants).
FROM debian:9.9-slim
# Install newest R version
# Add the CRAN apt repo, install r-base, then purge bootstrap tools/caches.
RUN apt-get update && \
apt-get install --no-install-recommends --yes wget gnupg apt-transport-https && \
wget -qO - http://keys.gnupg.net/pks/lookup?op=get\&search=0xAD5F960A256A04AF | apt-key add - && \
echo 'deb http://cloud.r-project.org/bin/linux/debian stretch-cran35/' >> /etc/apt/sources.list && \
apt-get update && \
apt-get install --no-install-recommends --yes r-base && \
apt-get purge --yes wget gnupg apt-transport-https && \
apt-get autoremove --yes && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
# Wrapper scripts plus the package-install script run below.
COPY Scripts/run_Garnett_CV.R \
Scripts/run_Garnett_Pretrained.R \
Dockerfiles/garnett/install_packages.R \
/Scripts/
# Install R packages
# Dev toolchain (incl. gfortran/LAPACK for monocle deps) only for the install
# step; purged afterwards to shrink the image.
RUN apt-get update && \
apt-get install --no-install-recommends --yes make gcc g++ libxml2-dev zlib1g-dev gfortran liblapack-dev libcurl4-gnutls-dev libssl-dev && \
Rscript --vanilla /Scripts/install_packages.R && \
apt-get purge --yes make gcc g++ zlib1g-dev gfortran liblapack-dev libcurl4-gnutls-dev libssl-dev && \
apt-get autoremove --yes && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
================================================
FILE: Snakemake/Dockerfiles/garnett/install_packages.R
================================================
# Installs garnett with its Bioconductor dependencies (monocle, org.* annotation
# packages for human and mouse gene IDs).
# withCallingHandlers promotes warnings to errors so a failed install
# aborts the docker build.
withCallingHandlers({
install.packages("BiocManager", repos="https://cloud.r-project.org/")
BiocManager::install(c("monocle", "DelayedArray", "DelayedMatrixStats",
"org.Hs.eg.db", "org.Mm.eg.db"))
install.packages("devtools", repos="https://cloud.r-project.org/")
# garnett pinned to a specific commit for reproducible builds.
devtools::install_github("cole-trapnell-lab/garnett", ref="9804b532bbcc1714b3ed0b718cf430741f1dba6c")
},
warning = function(w) stop(w))
================================================
FILE: Snakemake/Dockerfiles/scid/Dockerfile
================================================
# Image for the scID classifier; starts from a pinned r-base image instead of
# the Debian + CRAN-repo pattern used by the other R images.
FROM r-base:3.6.0
# Wrapper script plus the package-install script run below.
COPY Scripts/run_scID.R \
Dockerfiles/scid/install_packages.R \
/Scripts/
# Install R packages
# Dev toolchain only for the install step; purged afterwards to shrink the image.
RUN apt-get update && \
apt-get install --no-install-recommends --yes make gcc g++ r-base-dev libcurl4-openssl-dev libssl-dev libxml2-dev && \
Rscript --vanilla /Scripts/install_packages.R && \
apt-get purge --yes make gcc g++ r-base-dev libcurl4-openssl-dev libssl-dev libxml2-dev && \
apt-get autoremove --yes && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
================================================
FILE: Snakemake/Dockerfiles/scid/install_packages.R
================================================
# Installs scID and its dependencies (scater, MAST, Seurat).
# withCallingHandlers promotes warnings to errors so a failed install
# aborts the docker build.
# NOTE(review): seurat and scID are installed from GitHub HEAD without a
# pinned ref — builds are not reproducible; consider pinning a commit.
withCallingHandlers({
install.packages("BiocManager", repos="https://cloud.r-project.org/")
BiocManager::install(ask = FALSE);
BiocManager::install(c("scater", "MAST"))
install.packages("devtools", repos="https://cloud.r-project.org/")
devtools::install_github("satijalab/seurat")
devtools::install_github("BatadaLab/scID")
},
warning = function(w) stop(w))
================================================
FILE: Snakemake/Dockerfiles/scmap/Dockerfile
================================================
# Image for the scmap classifiers (scmap-cell and scmap-cluster).
FROM r-base:3.6.0
# Wrapper scripts plus the package-install script run below.
COPY Scripts/run_scmapcell.R \
Scripts/run_scmapcluster.R \
Dockerfiles/scmap/install_packages.R \
/Scripts/
# Install R packages
# Dev toolchain only for the install step; purged afterwards to shrink the image.
RUN apt-get update && \
apt-get install --no-install-recommends --yes make gcc g++ r-base-dev libcurl4-openssl-dev libssl-dev libxml2-dev && \
Rscript --vanilla /Scripts/install_packages.R && \
apt-get purge --yes make gcc g++ r-base-dev libcurl4-openssl-dev libssl-dev libxml2-dev && \
apt-get autoremove --yes && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
================================================
FILE: Snakemake/Dockerfiles/scmap/install_packages.R
================================================
# Installs scmap and SingleCellExperiment.
# withCallingHandlers promotes warnings to errors so a failed install
# aborts the docker build.
# NOTE(review): scmap is installed from GitHub HEAD without a pinned ref —
# builds are not reproducible; consider pinning a commit.
withCallingHandlers({
install.packages("BiocManager", repos="https://cloud.r-project.org/")
BiocManager::install(ask = FALSE)
BiocManager::install("SingleCellExperiment")
install.packages("devtools", repos="https://cloud.r-project.org/")
devtools::install_github("hemberg-lab/scmap")
},
warning = function(w) stop(w))
================================================
FILE: Snakemake/Dockerfiles/scvi/Dockerfile
================================================
# Image for the scVI classifier (Python 3.7 base; R added for rpy2).
FROM python:3.7-slim-stretch
# Install newest R version
# Add the CRAN apt repo, install r-base, then purge bootstrap tools/caches.
RUN apt-get update && \
apt-get install --no-install-recommends --yes wget gnupg apt-transport-https && \
wget -qO - http://keys.gnupg.net/pks/lookup?op=get\&search=0xAD5F960A256A04AF | apt-key add - && \
echo 'deb http://cloud.r-project.org/bin/linux/debian stretch-cran35/' >> /etc/apt/sources.list && \
apt-get update && \
apt-get install --no-install-recommends --yes r-base && \
apt-get purge --yes wget gnupg apt-transport-https && \
apt-get autoremove --yes && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
# Install python and pip deps
# Build toolchain is required to compile rpy2 and friends; "scvi" is the
# legacy PyPI name of the scVI package used by run_scVI.py.
RUN apt-get update && apt-get install --no-install-recommends --yes build-essential python3-dev libxml2 libxml2-dev zlib1g-dev && \
pip3 --no-cache-dir install --upgrade pip && \
pip3 --no-cache-dir install --upgrade setuptools && \
pip3 --no-cache-dir install pandas rpy2 scikit-learn statsmodels tensorflow scvi && \
apt-get autoremove --yes && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
# Wrapper script for this method.
COPY Scripts/run_scVI.py /Scripts/
================================================
FILE: Snakemake/Dockerfiles/singlecellnet/Dockerfile
================================================
# Image for the singleCellNet classifier (R only).
FROM debian:9.9-slim
# Install newest R version
# Add the CRAN apt repo, install r-base, then purge bootstrap tools/caches.
RUN apt-get update && \
apt-get install --no-install-recommends --yes wget gnupg apt-transport-https && \
wget -qO - http://keys.gnupg.net/pks/lookup?op=get\&search=0xAD5F960A256A04AF | apt-key add - && \
echo 'deb http://cloud.r-project.org/bin/linux/debian stretch-cran35/' >> /etc/apt/sources.list && \
apt-get update && \
apt-get install --no-install-recommends --yes r-base && \
apt-get purge --yes wget gnupg apt-transport-https && \
apt-get autoremove --yes && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
# Wrapper script plus the package-install script run below.
COPY Scripts/run_singleCellNet.R \
Dockerfiles/singlecellnet/install_packages.R \
/Scripts/
# Install R packages
# Dev toolchain only for the install step; purged afterwards to shrink the image.
RUN apt-get update && \
apt-get install --no-install-recommends --yes make gcc libc6-dev g++ libcurl4-openssl-dev zlib1g-dev libssl-dev r-base-dev libxml2-dev && \
Rscript --vanilla /Scripts/install_packages.R && \
apt-get purge --yes make gcc g++ zlib1g-dev libcurl4-openssl-dev libc6-dev libssl-dev r-base-dev libxml2-dev && \
apt-get autoremove --yes && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
================================================
FILE: Snakemake/Dockerfiles/singlecellnet/install_packages.R
================================================
# Installs singleCellNet plus its fgsea/patchwork dependencies.
# withCallingHandlers promotes warnings to errors so a failed install
# aborts the docker build.
withCallingHandlers({
install.packages("devtools", repos="https://cloud.r-project.org/")
install.packages("BiocManager", repos="https://cloud.r-project.org/")
BiocManager::install("fgsea")
# Both GitHub packages pinned to specific commits for reproducible builds.
devtools::install_github("thomasp85/patchwork", ref="fd7958bae3e7a1e30237c751952e412a0a1d1242")
devtools::install_github("pcahan1/singleCellNet", ref="4279a68112743b783cc82628421dd703261ec117")
},
warning = function(w) stop(w))
================================================
FILE: Snakemake/Dockerfiles/singler/Dockerfile
================================================
# Image for the SingleR classifier (R only).
FROM debian:9.9-slim
# Install newest R version
# Add the CRAN apt repo, install r-base, then purge bootstrap tools/caches.
RUN apt-get update && \
apt-get install --no-install-recommends --yes wget gnupg apt-transport-https && \
wget -qO - http://keys.gnupg.net/pks/lookup?op=get\&search=0xAD5F960A256A04AF | apt-key add - && \
echo 'deb http://cloud.r-project.org/bin/linux/debian stretch-cran35/' >> /etc/apt/sources.list && \
apt-get update && \
apt-get install --no-install-recommends --yes r-base && \
apt-get purge --yes wget gnupg apt-transport-https && \
apt-get autoremove --yes && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
# Wrapper script plus the package-install script run below.
COPY Scripts/run_SingleR.R \
Dockerfiles/singler/install_packages.R \
/Scripts/
# Dev toolchain only for the install step; purged afterwards (libxml2 runtime
# library is installed alongside and kept).
RUN apt-get update && \
apt-get install --no-install-recommends --yes make gcc g++ r-base-dev libcurl4-openssl-dev libssl-dev libxml2-dev libxml2 && \
Rscript --vanilla /Scripts/install_packages.R && \
apt-get purge --yes make gcc g++ r-base-dev libcurl4-openssl-dev libssl-dev libxml2-dev && \
apt-get autoremove --yes && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
================================================
FILE: Snakemake/Dockerfiles/singler/install_packages.R
================================================
# Install the R packages required by the SingleR benchmark image.
# Warnings emitted during installation are escalated to errors via
# withCallingHandlers(), so an incomplete install fails the Docker build
# loudly instead of leaving a broken image behind.
cran <- "https://cloud.r-project.org/"
withCallingHandlers(
  {
    install.packages("devtools", repos = cran)
    install.packages("Seurat", repos = cran)
    # SingleR is pinned to an exact commit hash for reproducible builds.
    devtools::install_github(
      "dviraran/SingleR",
      ref = "db4823b380ba2c3142c857c8c0695200dd1736f6"
    )
  },
  warning = function(w) stop(w)
)
================================================
FILE: Snakemake/LICENSE
================================================
MIT License
Copyright (c) 2019 tabdelaal
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
================================================
FILE: Snakemake/README.md
================================================
# scRNAseq_Benchmark
Benchmarking classification tools for scRNA-seq data
## How to use
[snakemake](https://snakemake.readthedocs.io/en/stable/index.html) and
[singularity](https://www.sylabs.io/docs/) need to be available on your
system. You will need to run this on a linux system, as singularity
only supports linux.
From the root of this repository:
```
snakemake \
--configfile <path/to/config.yml> \
--use-singularity
```
If your data or output directory is not located under the root of this
repository, be sure to tell snakemake to mount the appropriate directories
in singularity:
```
snakemake \
--configfile <path/to/config.yml> \
--use-singularity \
--singularity-args '--bind : --bind