Showing preview only (310K chars total). Download the full file or copy to clipboard to get everything.
Repository: tabdelaal/scRNAseq_Benchmark
Branch: master
Commit: 553869b632f4
Files: 82
Total size: 288.4 KB
Directory structure:
gitextract_ikyozzhh/
├── Cross_Validation.R
├── DEgenesMAST.R
├── LICENSE
├── README.md
├── Scripts/
│ ├── run_ACTINN.py
│ ├── run_CHETAH.R
│ ├── run_CaSTLe.R
│ ├── run_Cell_BLAST.py
│ ├── run_DigitalCellSorter.py
│ ├── run_Garnett_CV.R
│ ├── run_Garnett_Pretrained.R
│ ├── run_LAmbDA.py
│ ├── run_LDA.py
│ ├── run_LDA_rejection.py
│ ├── run_NMC.py
│ ├── run_RF.py
│ ├── run_SCINA.R
│ ├── run_SVM.py
│ ├── run_SVM_rejection.py
│ ├── run_SingleR.R
│ ├── run_kNN50.py
│ ├── run_kNN9.py
│ ├── run_moana.py
│ ├── run_scID.R
│ ├── run_scPred.R
│ ├── run_scVI.py
│ ├── run_scmap.R
│ └── run_singleCellNet.R
├── Snakemake/
│ ├── Cross_Validation.R
│ ├── DEgenesMAST.R
│ ├── Dockerfiles/
│ │ ├── baseline/
│ │ │ └── Dockerfile
│ │ ├── cell_blast/
│ │ │ └── Dockerfile
│ │ ├── chetah/
│ │ │ ├── Dockerfile
│ │ │ └── install_packages.R
│ │ ├── cross_validation/
│ │ │ ├── Dockerfile
│ │ │ └── install_packages.R
│ │ ├── garnett/
│ │ │ ├── Dockerfile
│ │ │ └── install_packages.R
│ │ ├── scid/
│ │ │ ├── Dockerfile
│ │ │ └── install_packages.R
│ │ ├── scmap/
│ │ │ ├── Dockerfile
│ │ │ └── install_packages.R
│ │ ├── scvi/
│ │ │ └── Dockerfile
│ │ ├── singlecellnet/
│ │ │ ├── Dockerfile
│ │ │ └── install_packages.R
│ │ └── singler/
│ │ ├── Dockerfile
│ │ └── install_packages.R
│ ├── LICENSE
│ ├── README.md
│ ├── Scripts/
│ │ ├── run_ACTINN.py
│ │ ├── run_CHETAH.R
│ │ ├── run_CaSTLe.R
│ │ ├── run_Cell_BLAST.py
│ │ ├── run_DigitalCellSorter.py
│ │ ├── run_Garnett_CV.R
│ │ ├── run_Garnett_Pretrained.R
│ │ ├── run_LAmbDA.py
│ │ ├── run_LDA.py
│ │ ├── run_LDA_rejection.py
│ │ ├── run_NMC.py
│ │ ├── run_RF.py
│ │ ├── run_SCINA.R
│ │ ├── run_SVM.py
│ │ ├── run_SVM_rejection.py
│ │ ├── run_SingleR.R
│ │ ├── run_kNN50.py
│ │ ├── run_kNN9.py
│ │ ├── run_moana.py
│ │ ├── run_scID.R
│ │ ├── run_scPred.R
│ │ ├── run_scVI.py
│ │ ├── run_scmap.R
│ │ ├── run_scmapcell.R
│ │ ├── run_scmapcluster.R
│ │ ├── run_scmaptotal.R
│ │ └── run_singleCellNet.R
│ ├── Snakefile
│ ├── evaluate.R
│ ├── example.config.yml
│ └── rank_gene_dropouts.py
├── evaluate.R
└── rank_gene_dropouts.py
================================================
FILE CONTENTS
================================================
================================================
FILE: Cross_Validation.R
================================================
Cross_Validation <- function(LabelsPath, col_Index = 1, OutputDir){
  "
  Cross_Validation
  Function returns train and test indices for 5 folds stratified across unique cell populations,
  also filters out cell populations with 10 or fewer cells.
  It saves a 'CV_folds.RData' file which is then used as input to the classifier wrappers.
  Parameters
  ----------
  LabelsPath : Cell population annotations file path (.csv).
  col_Index : column index (integer) defining which level of annotation to use,
  in case of multiple cell type annotations (default is 1)
  OutputDir : Output directory defining the path of the exported file.
  "
  Labels <- as.matrix(read.csv(LabelsPath))
  Labels <- as.vector(Labels[, col_Index])
  # table(Labels) > 10 keeps populations with MORE than 10 cells;
  # everything else is flagged for removal.
  Removed_classes <- !(table(Labels) > 10)
  Cells_to_Keep <- !(is.element(Labels, names(Removed_classes)[Removed_classes]))
  Labels <- Labels[Cells_to_Keep]
  # Stratified 5-fold split of the remaining cells
  library(rBayesianOptimization)
  n_folds <- 5
  Folds <- KFold(Labels, nfolds = n_folds, stratified = TRUE)
  Test_Folds <- c(n_folds:1)
  Train_Idx <- list()
  Test_Idx <- list()
  for (i in seq_along(Folds)){
    # Fold Test_Folds[i] is held out; the remaining folds form the training set
    Temp_Folds <- Folds
    Temp_Folds[Test_Folds[i]] <- NULL
    Train_Idx[i] <- list(unlist(Temp_Folds))
    Test_Idx[i] <- Folds[Test_Folds[i]]
  }
  remove(Temp_Folds, i, Folds)
  # Write directly into OutputDir instead of setwd(OutputDir), so the
  # caller's working directory is left untouched.
  save(n_folds, Train_Idx, Test_Idx, col_Index, Cells_to_Keep,
       file = file.path(OutputDir, 'CV_folds.RData'))
}
================================================
FILE: DEgenesMAST.R
================================================
DEgenesMAST <- function(Data, Labels, Normalize = FALSE, LogTransform = FALSE){
  # Applies a one-vs-all differential expression test (MAST, via Seurat) to the data.
  # The training data should be used as the input.
  # Output: a matrix of marker genes where columns are the cell populations and rows
  # are the top-20 marker genes (padded with NA when a population has fewer than 20
  # positively-expressed markers).
  # This output can be rewritten to the format of the prior-knowledge-supervised
  # classifiers and afterwards be used to classify the test set.
  # Data: genes X cells (rows = genes, columns = cells)
  # Labels: labels of the data
  # Normalize: MAST expects CPM-normalized input; set TRUE if the data is not
  #            normalized yet
  # LogTransform: MAST expects log-transformed input; set TRUE if the data is not
  #               log-transformed yet
  library(Seurat)
  if (Normalize){
    # counts-per-million per cell
    Data <- apply(Data, 2, function(x) (x / sum(x)) * 1000000)
  }
  if (LogTransform){
    Data <- log(Data + 1, base = 2)
  }
  # Seurat v2 API (raw.data / SetIdent with ident.use)
  SeuObj <- CreateSeuratObject(raw.data = Data, project = "DEgenes")
  SeuObj <- SetIdent(SeuObj, ident.use = Labels)
  DEgenes <- FindAllMarkers(SeuObj, test.use = "MAST")
  Markers <- matrix(nrow = 20, ncol = length(unique(Labels)))
  colnames(Markers) <- unique(Labels)
  for (i in unique(Labels)){
    # keep only up-regulated genes (positive average log fold-change)
    TempList <- DEgenes$gene[((DEgenes$cluster == i) & (DEgenes$avg_logFC > 0))]
    # diagnostic printout of the adjusted p-values for this population
    MarkerGenes <- DEgenes$p_val_adj[DEgenes$cluster == i]
    print(MarkerGenes[1:20])
    if (length(TempList) >= 20){
      Markers[, i] <- TempList[1:20]
    } else if (length(TempList) > 0){
      Markers[seq_along(TempList), i] <- TempList
    }
  }
  return(Markers)
}
================================================
FILE: LICENSE
================================================
MIT License
Copyright (c) 2019 tabdelaal
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
================================================
FILE: README.md
================================================
# A comparison of automatic cell identification methods for single-cell RNA-sequencing data
We present a comprehensive evaluation of the performance of state-of-the-art classification methods, in addition to general-purpose classifiers, for automatic cell identification in single-cell RNA-sequencing datasets. Our goal is to provide the community with a fair evaluation of all available methods to facilitate the users’ choice as well as direct further developments to focus on the challenging aspects of automated cell type identification. (Published in Genome Biology, Sep. 2019: https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1795-z)
### Repository description
We provide all the scripts to run and evaluate all classifiers, and to reproduce the results introduced in the paper.
1. 'Scripts' folder contains a wrapper function to read the data and apply certain classification method.
2. ```Cross_Validation``` R script can be used to produce training and test indices for cross validation.
3. ```rank_gene_dropouts``` Python script can be used to apply feature selection using the dropout method, and rank genes accordingly.
4. ```evaluate``` R script can be used to evaluate the prediction of a certain classifier and obtain scores such as accuracy, median F1-score and % unlabeled cells.
For more details, please check function documentations.
### General Usage
To benchmark and fairly evaluate the performance of different classifiers using benchmark-datasets (Filtered datasets can be downloaded from https://zenodo.org/record/3357167), apply the following steps:
#### Step 1
Apply the ```Cross_Validation``` R function on the corresponding dataset to obtain fixed training and test cell indices, stratified across different cell types. For example, using the Tabula Muris (TM) dataset
```R
Cross_Validation('~/TM/Labels.csv', 1, '~/TM/')
```
This command will create a ```CV_folds.RData``` file used as input in Step 2.
#### Step 2
Run each classifier wrapper. For example, running scPred on TM dataset
```R
run_scPred('~/TM/Filtered_TM_data.csv','~/TM/Labels.csv','~/TM/CV_folds.RData','~/Results/TM/')
```
This command will output the true and predicted cell labels as csv files, as well as the classifier computation time.
#### Step 3
Evaluate the classifier prediction by
```R
result <- evaluate('~/Results/TM/scPred_True_Labels.csv', '~/Results/TM/scPred_Pred_Labels.csv')
```
This command will return the corresponding accuracy, median F1-score, F1-scores for all cell populations, % unlabeled cells, and confusion matrix.
### Usage with feature selection
#### Step 1
Apply the ```Cross_Validation``` R function on the corresponding dataset to obtain fixed training and test cell indices, stratified across different cell types. For example, using the Tabula Muris (TM) dataset
```R
Cross_Validation('~/TM/Labels.csv', 1, '~/TM/')
```
This command will create a ```CV_folds.RData``` file used as input in Step 2 and 3.
#### Step 2
Apply the ```rank_gene_dropouts``` Python script to get the genes ranking for each training fold using the dropout criteria
```
rank_gene_dropouts('~/TM/Filtered_TM_data.csv', '~/TM/CV_folds.RData', '~/TM/')
```
This command will create a ```rank_genes_dropouts.csv``` file used as input in Step 3.
#### Step 3
Run each classifier wrapper. For example, running scPred on TM dataset with 1000 genes
```R
run_scPred('~/TM/Filtered_TM_data.csv','~/TM/Labels.csv','~/TM/CV_folds.RData','~/Results/TM/',
GeneOrderPath = '~/TM/rank_genes_dropouts.csv',NumGenes = 1000)
```
This command will output the true and predicted cell labels as csv files, as well as the classifier computation time.
#### Step 4
Evaluate the classifier prediction by
```R
result <- evaluate('~/Results/TM/scPred_True_Labels.csv', '~/Results/TM/scPred_Pred_Labels.csv')
```
This command will return the corresponding accuracy, median F1-score, F1-scores for all cell populations, % unlabeled cells, and confusion matrix.
### Evaluate Marker-based methods using DE genes
To evaluate the marker-based methods SCINA, DigitalCellSorter and Garnett using DE genes learned from the data, you may follow these steps:
#### Step 1
Apply the ```Cross_Validation``` R function on the corresponding dataset to obtain fixed training and test cell indices, stratified across different cell types. For example, using the Zheng_sorted dataset
```R
Cross_Validation('~/Zheng_sorted/Labels.csv', 1, '~/Zheng_sorted/')
```
This command will create a ```CV_folds.RData``` file used as input in Step 2 and 3.
#### Step 2
For each fold use the training data to get the DE genes using the ```DEgenesMAST``` R function, and pass these DE genes to the corresponding method, for example here we use SCINA, to obtain cell prediction for the test data.
```R
load('CV_folds.RData')
Data <- read.csv('~/Zheng_sorted/Filtered_DownSampled_SortedPBMC_data.csv',row.names = 1)
Labels <- as.matrix(read.csv('~/Zheng_sorted/Labels.csv'))
Labels <- as.vector(Labels[,col_Index])
Data <- Data[Cells_to_Keep,]
Labels <- Labels[Cells_to_Keep]
for (i in c(1:n_folds))
{
MarkerGenes <- DEgenesMAST(t(Data[Train_Idx[[i]],]), Labels[Train_Idx[[i]]], Normalize = TRUE, LogTransform = TRUE)
## write the MarkerGenes into a marker genes file format, depending on the tested method, for example for SCINA
write.csv(MarkerGenes, 'MarkerGenes.csv')
## run the SCINA wrapper using these DE marker genes
run_SCINA(Data[Test_Idx[[i]],], Labels[Test_Idx[[i]]], 'MarkerGenes.csv', '~/Results/Zheng_sorted/')
}
```
### Snakemake
To support future extension of this benchmarking work with new classifiers and datasets, we provide a Snakemake workflow to automate the performed benchmarking analyses (https://github.com/tabdelaal/scRNAseq_Benchmark/tree/snakemake_and_docker).
================================================
FILE: Scripts/run_ACTINN.py
================================================
import os
import numpy as np
import pandas as pd
import time as tm
import rpy2.robjects as robjects
def run_ACTINN(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0,
               ACTINN_Path = "/home/nfs/lcmmichielsen/classifiers/ACTINN"):
    '''
    run ACTINN
    Wrapper script to run ACTINN on a benchmark dataset with 5-fold cross validation,
    outputs lists of true and predicted cell labels as csv files, as well as computation time.

    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
    as row names and gene names as column names.
    LabelsPath : Cell population annotations file path (.csv).
    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
    OutputDir : Output directory defining the path of the exported file.
    GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
    defining the genes order for each cross validation fold, default is NULL.
    NumGenes : Number of genes used in case of feature selection (integer), default is 0.
    ACTINN_Path : Directory containing ACTINN's actinn_format.py and actinn_predict.py
    scripts; defaults to the previously hard-coded location for backward compatibility.
    '''
    # read the RData file written by Cross_Validation.R
    robjects.r['load'](CV_RDataPath)
    nfolds = np.array(robjects.r['n_folds'], dtype = 'int')
    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
    col = np.array(robjects.r['col_Index'], dtype = 'int')
    col = col - 1  # R indices are 1-based, pandas is 0-based
    test_ind = np.array(robjects.r['Test_Idx'])
    train_ind = np.array(robjects.r['Train_Idx'])

    # read the data and labels, keeping only cells from retained populations
    data = pd.read_csv(DataPath, index_col=0, sep=',')
    labels = pd.read_csv(LabelsPath, header=0, index_col=None, sep=',', usecols = col)
    labels = labels.iloc[tokeep]
    data = data.iloc[tokeep]

    # read the gene-ranking file when feature selection is requested
    if (NumGenes > 0):
        features = pd.read_csv(GeneOrderPath, header=0, index_col=None, sep=',')

    # intermediate files and results are written in the output folder
    os.chdir(OutputDir)

    fmt_script = os.path.join(ACTINN_Path, "actinn_format.py")
    pred_script = os.path.join(ACTINN_Path, "actinn_predict.py")

    tot = []
    truelab = []
    pred = []
    for i in range(np.squeeze(nfolds)):
        test_ind_i = np.array(test_ind[i], dtype = 'int') - 1
        train_ind_i = np.array(train_ind[i], dtype = 'int') - 1
        train = data.iloc[train_ind_i]
        test = data.iloc[test_ind_i]
        y_train = labels.iloc[train_ind_i]
        y_test = labels.iloc[test_ind_i]

        if (NumGenes > 0):
            # column positions of the top-ranked genes for this fold
            feat_to_use = features.iloc[0:NumGenes, i]
            train = train.iloc[:, feat_to_use]
            test = test.iloc[:, feat_to_use]

        # ACTINN expects genes x cells
        train = train.transpose()
        test = test.transpose()

        train.to_csv("train.csv")
        test.to_csv("test.csv")
        y_train.to_csv("train_lab.csv", header = False, index = True, sep = '\t')
        y_test.to_csv("test_lab.csv", header = False, index = True, sep = '\t')

        # pause so the files are fully flushed before the external scripts read
        # them -- presumably needed for network storage; TODO confirm
        tm.sleep(60)

        os.system("python " + fmt_script + " -i train.csv -o train -f csv")
        os.system("python " + fmt_script + " -i test.csv -o test -f csv")

        # only the prediction step is timed
        start = tm.time()
        os.system("python " + pred_script + " -trs train.h5 -trl train_lab.csv -ts test.h5")
        tot.append(tm.time() - start)

        tm.sleep(60)

        truelab.extend(y_test.values)
        predlabels = pd.read_csv('predicted_label.txt', header=0, index_col=None, sep='\t', usecols = [1])
        pred.extend(predlabels.values)

    # write results
    truelab = pd.DataFrame(truelab)
    pred = pd.DataFrame(pred)
    tot_time = pd.DataFrame(tot)

    prefix = "ACTINN_" if NumGenes == 0 else "ACTINN_" + str(NumGenes) + "_"
    truelab.to_csv(prefix + "True_Labels.csv", index = False)
    pred.to_csv(prefix + "Pred_Labels.csv", index = False)
    tot_time.to_csv(prefix + "Total_Time.csv", index = False)
FILE: Scripts/run_CHETAH.R
================================================
run_CHETAH <- function(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = NULL, NumGenes = NULL){
  "
  run CHETAH
  Wrapper script to run CHETAH on a benchmark dataset with 5-fold cross validation,
  outputs lists of true and predicted cell labels as csv files, as well as computation time.
  Parameters
  ----------
  DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
  as row names and gene names as column names.
  LabelsPath : Cell population annotations file path (.csv).
  CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
  OutputDir : Output directory defining the path of the exported file.
  GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
  defining the genes order for each cross validation fold, default is NULL.
  NumGenes : Number of genes used in case of feature selection (integer), default is NULL.
  "
  Data <- read.csv(DataPath, row.names = 1)
  Labels <- as.matrix(read.csv(LabelsPath))
  load(CV_RDataPath)  # provides n_folds, Train_Idx, Test_Idx, col_Index, Cells_to_Keep
  Labels <- as.vector(Labels[, col_Index])
  Data <- Data[Cells_to_Keep, ]
  Labels <- Labels[Cells_to_Keep]
  # scalar condition: short-circuit && instead of elementwise & (hoisted, it was
  # re-evaluated three times in the original)
  use_features <- !is.null(GeneOrderPath) && !is.null(NumGenes)
  if (use_features){
    GenesOrder <- read.csv(GeneOrderPath)
  }
  #############################################################################
  #                                 CHETAH                                    #
  #############################################################################
  library(CHETAH)
  library(SingleCellExperiment)
  True_Labels_CHETAH <- list()
  Pred_Labels_CHETAH <- list()
  Total_Time_CHETAH <- list()
  Data <- t(as.matrix(Data))  # CHETAH expects genes x cells
  for (i in seq_len(n_folds)){
    if (use_features){
      # +1: the ranks in the CSV are 0-based (file produced by Python)
      sel <- as.vector(GenesOrder[seq_len(NumGenes), i]) + 1
      sce <- SingleCellExperiment(assays = list(counts = Data[sel, Train_Idx[[i]]]),
                                  colData = data.frame(celltypes = Labels[Train_Idx[[i]]]))
      sce_test <- SingleCellExperiment(assays = list(counts = Data[sel, Test_Idx[[i]]]),
                                       colData = data.frame(celltypes = Labels[Test_Idx[[i]]]))
      start_time <- Sys.time()
      sce_test <- CHETAHclassifier(input = sce_test, ref_cells = sce, n_genes = NumGenes)
      end_time <- Sys.time()
    } else {
      sce <- SingleCellExperiment(assays = list(counts = Data[, Train_Idx[[i]]]),
                                  colData = data.frame(celltypes = Labels[Train_Idx[[i]]]))
      sce_test <- SingleCellExperiment(assays = list(counts = Data[, Test_Idx[[i]]]),
                                       colData = data.frame(celltypes = Labels[Test_Idx[[i]]]))
      start_time <- Sys.time()
      sce_test <- CHETAHclassifier(input = sce_test, ref_cells = sce)
      end_time <- Sys.time()
    }
    Total_Time_CHETAH[i] <- as.numeric(difftime(end_time, start_time, units = 'secs'))
    True_Labels_CHETAH[i] <- list(Labels[Test_Idx[[i]]])
    Pred_Labels_CHETAH[i] <- list(sce_test$celltype_CHETAH)
  }
  True_Labels_CHETAH <- as.vector(unlist(True_Labels_CHETAH))
  Pred_Labels_CHETAH <- as.vector(unlist(Pred_Labels_CHETAH))
  Total_Time_CHETAH <- as.vector(unlist(Total_Time_CHETAH))
  setwd(OutputDir)
  if (use_features){
    write.csv(True_Labels_CHETAH, paste('CHETAH_', NumGenes, '_True_Labels.csv', sep = ''), row.names = FALSE)
    write.csv(Pred_Labels_CHETAH, paste('CHETAH_', NumGenes, '_Pred_Labels.csv', sep = ''), row.names = FALSE)
    write.csv(Total_Time_CHETAH, paste('CHETAH_', NumGenes, '_Total_Time.csv', sep = ''), row.names = FALSE)
  } else {
    write.csv(True_Labels_CHETAH, 'CHETAH_True_Labels.csv', row.names = FALSE)
    write.csv(Pred_Labels_CHETAH, 'CHETAH_Pred_Labels.csv', row.names = FALSE)
    write.csv(Total_Time_CHETAH, 'CHETAH_Total_Time.csv', row.names = FALSE)
  }
}
================================================
FILE: Scripts/run_CaSTLe.R
================================================
run_CaSTLe <- function(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = NULL, NumGenes = NULL){
  "
  run CaSTLe
  Wrapper script to run CaSTLe on a benchmark dataset with 5-fold cross validation,
  outputs lists of true and predicted cell labels as csv files, as well as computation time.
  Parameters
  ----------
  DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
  as row names and gene names as column names.
  LabelsPath : Cell population annotations file path (.csv).
  CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
  OutputDir : Output directory defining the path of the exported file.
  GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
  defining the genes order for each cross validation fold, default is NULL.
  NumGenes : Number of genes used in case of feature selection (integer), default is NULL.
  "
  Data <- read.csv(DataPath, row.names = 1)
  Labels <- as.matrix(read.csv(LabelsPath))
  load(CV_RDataPath)  # provides n_folds, Train_Idx, Test_Idx, col_Index, Cells_to_Keep
  Labels <- as.vector(Labels[, col_Index])
  Data <- Data[Cells_to_Keep, ]
  Labels <- Labels[Cells_to_Keep]
  # scalar condition: short-circuit && instead of elementwise &
  use_features <- !is.null(GeneOrderPath) && !is.null(NumGenes)
  if (use_features){
    GenesOrder <- read.csv(GeneOrderPath)
  }
  #############################################################################
  #                                CaSTLe                                     #
  #############################################################################
  library(igraph)
  library(xgboost)
  True_Labels_Castle <- list()
  Pred_Labels_Castle <- list()
  Training_Time_Castle <- list()
  Testing_Time_Castle <- list()
  BREAKS <- c(-1, 0, 1, 6, Inf)  # expression bins used to build dummy variables
  nFeatures <- 100
  for (i in seq_len(n_folds)){
    # 1. Load datasets (optionally restricted to the top NumGenes ranked genes;
    #    +1 because the ranks in the CSV are 0-based, file produced by Python)
    if (use_features){
      ds1 <- Data[Train_Idx[[i]], as.vector(GenesOrder[seq_len(NumGenes), i]) + 1]
      ds2 <- Data[Test_Idx[[i]], as.vector(GenesOrder[seq_len(NumGenes), i]) + 1]
    } else {
      ds1 <- Data[Train_Idx[[i]], ]
      ds2 <- Data[Test_Idx[[i]], ]
    }
    sourceCellTypes <- as.factor(Labels[Train_Idx[[i]]])
    targetCellTypes <- as.factor(Labels[Test_Idx[[i]]])
    start_time <- Sys.time()
    # 2. Unify sets, excluding low expressed genes
    source_n_cells_counts <- apply(ds1, 2, function(x) { sum(x > 0) })
    target_n_cells_counts <- apply(ds2, 2, function(x) { sum(x > 0) })
    common_genes <- intersect(colnames(ds1)[source_n_cells_counts > 10],
                              colnames(ds2)[target_n_cells_counts > 10])
    remove(source_n_cells_counts, target_n_cells_counts)
    ds1 <- ds1[, colnames(ds1) %in% common_genes]
    ds2 <- ds2[, colnames(ds2) %in% common_genes]
    ds <- rbind(ds1[, common_genes], ds2[, common_genes])
    isSource <- c(rep(TRUE, nrow(ds1)), rep(FALSE, nrow(ds2)))
    remove(ds1, ds2)
    # 3. Highest mean in both source and target
    #    (colMeans is equivalent to apply(., 2, mean) and faster)
    topFeaturesAvg <- colnames(ds)[order(colMeans(ds), decreasing = TRUE)]
    end_time <- Sys.time()
    Training_Time_Castle[i] <- as.numeric(difftime(end_time, start_time, units = 'secs'))
    start_time <- Sys.time()
    # for each cell - what is the most probable classification?
    L <- length(levels(sourceCellTypes))
    targetClassification <- as.data.frame(matrix(rep(0, L * sum(!isSource)), nrow = L),
                                          row.names = levels(sourceCellTypes))
    for (cellType in levels(sourceCellTypes)) {
      inSourceCellType <- as.factor(ifelse(sourceCellTypes == cellType, cellType, paste0("NOT",cellType)))
      # 4. Highest mutual information in source (igraph::compare, method "nmi")
      topFeaturesMi <- names(sort(apply(ds[isSource, ], 2, function(x) { compare(cut(x, breaks = BREAKS), inSourceCellType, method = "nmi") }), decreasing = TRUE))
      # 5. Top n genes that appear in both mi and avg
      selectedFeatures <- union(head(topFeaturesAvg, nFeatures), head(topFeaturesMi, nFeatures))
      # 6. remove correlated features
      tmp <- cor(ds[, selectedFeatures], method = "pearson")
      tmp[!lower.tri(tmp)] <- 0
      selectedFeatures <- selectedFeatures[apply(tmp, 2, function(x) any(x < 0.9))]
      remove(tmp)
      # 7,8. Convert data from continuous to binned dummy vars
      dsBins <- apply(ds[, selectedFeatures], 2, cut, breaks = BREAKS)
      # use only bins with more than one value
      nUniq <- apply(dsBins, 2, function(x) { length(unique(x)) })
      # convert to dummy vars
      ds0 <- model.matrix(~ . , as.data.frame(dsBins[, nUniq > 1]))
      remove(dsBins, nUniq)
      cat(paste0("<h2>Classifier for ",cellType,"</h2>"))
      inTypeSource <- sourceCellTypes == cellType
      # 9. Classify (binary one-vs-rest booster per cell type)
      xg <- xgboost(data = ds0[isSource, ],
                    label = inTypeSource,
                    objective = "binary:logistic",
                    eta = 0.7, nthread = 1, nround = 20, verbose = 0,
                    gamma = 0.001, max_depth = 5, min_child_weight = 10)
      # 10. Predict
      inTypeProb <- predict(xg, ds0[!isSource, ])
      targetClassification[cellType, ] <- inTypeProb
    }
    end_time <- Sys.time()
    Testing_Time_Castle[i] <- as.numeric(difftime(end_time, start_time, units = 'secs'))
    True_Labels_Castle[i] <- list(Labels[Test_Idx[[i]]])
    # predicted type = row (cell type) with maximal probability per cell
    Pred_Labels_Castle[i] <- list(rownames(targetClassification)[apply(targetClassification, 2, which.max)])
  }
  True_Labels_Castle <- as.vector(unlist(True_Labels_Castle))
  Pred_Labels_Castle <- as.vector(unlist(Pred_Labels_Castle))
  Training_Time_Castle <- as.vector(unlist(Training_Time_Castle))
  Testing_Time_Castle <- as.vector(unlist(Testing_Time_Castle))
  # BUG FIX: the original never used OutputDir and wrote results into whatever
  # the current working directory happened to be; write into OutputDir as the
  # documentation (and every sibling wrapper) promises.
  if (use_features){
    write.csv(True_Labels_Castle, file.path(OutputDir, paste('True_Labels_Castle_', NumGenes, '.csv', sep = '')), row.names = FALSE)
    write.csv(Pred_Labels_Castle, file.path(OutputDir, paste('Pred_Labels_Castle_', NumGenes, '.csv', sep = '')), row.names = FALSE)
    write.csv(Training_Time_Castle, file.path(OutputDir, paste('Training_Time_Castle_', NumGenes, '.csv', sep = '')), row.names = FALSE)
    write.csv(Testing_Time_Castle, file.path(OutputDir, paste('Testing_Time_Castle_', NumGenes, '.csv', sep = '')), row.names = FALSE)
  } else {
    write.csv(True_Labels_Castle, file.path(OutputDir, 'True_Labels_CaSTLe.csv'), row.names = FALSE)
    write.csv(Pred_Labels_Castle, file.path(OutputDir, 'Pred_Labels_CaSTLe.csv'), row.names = FALSE)
    write.csv(Training_Time_Castle, file.path(OutputDir, 'Training_Time_CaSTLe.csv'), row.names = FALSE)
    write.csv(Testing_Time_Castle, file.path(OutputDir, 'Testing_Time_CaSTLe.csv'), row.names = FALSE)
  }
}
================================================
FILE: Scripts/run_Cell_BLAST.py
================================================
import os
import time as tm
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
import tensorflow as tf
tf.logging.set_verbosity(0)
import Cell_BLAST as cb
import numpy as np
from numpy import genfromtxt as gft
import rpy2.robjects as robjects
def run_Cell_BLAST(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
    '''
    run Cell_BLAST
    Wrapper script to run Cell_BLAST on a benchmark dataset with 5-fold cross validation,
    outputs lists of true and predicted cell labels as csv files, as well as computation time.

    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
    as row names and gene names as column names.
    LabelsPath : Cell population annotations file path (.csv).
    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
    OutputDir : Output directory defining the path of the exported file.
    GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
    defining the genes order for each cross validation fold, default is NULL.
    NumGenes : Number of genes used in case of feature selection (integer), default is 0.
    '''
    # read the RData file written by Cross_Validation.R
    robjects.r['load'](CV_RDataPath)
    nfolds = np.array(robjects.r['n_folds'], dtype = 'int')
    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
    col = np.array(robjects.r['col_Index'], dtype = 'int')
    col = col - 1  # R indices are 1-based
    test_ind = np.array(robjects.r['Test_Idx'])
    train_ind = np.array(robjects.r['Train_Idx'])

    # read the gene-ranking file when feature selection is requested
    if (NumGenes > 0):
        features = pd.read_csv(GeneOrderPath, header=0, index_col=None, sep=',')

    # read the data and labels; keep only cells from retained populations.
    # (A redundant pd.read_csv of LabelsPath here was dead code -- its value
    # was immediately overwritten by the genfromtxt read -- and was removed.)
    data_old = cb.data.ExprDataSet.read_table(DataPath, orientation="cg", sep=",", index_col = 0, header = 0, sparsify = True).normalize()
    data = cb.data.ExprDataSet(data_old.exprs[tokeep], data_old.obs.iloc[tokeep], data_old.var, data_old.uns)
    labels = gft(LabelsPath, dtype = "str", skip_header = 1, delimiter = ",", usecols = col)
    labels = labels[tokeep]

    os.chdir(OutputDir)

    truelab = []
    pred = []
    tr_time = []
    ts_time = []
    for i in range(np.squeeze(nfolds)):
        test_ind_i = np.array(test_ind[i], dtype = 'int') - 1
        train_ind_i = np.array(train_ind[i], dtype = 'int') - 1
        train = data[train_ind_i, :]
        test = data[test_ind_i, :]
        y_train = labels[train_ind_i]
        y_test = labels[test_ind_i]

        if (NumGenes > 0):
            feat_to_use = features.iloc[0:NumGenes, i]
            train = train[:, feat_to_use]
            test = test[:, feat_to_use]

        train.obs['cell_type'] = y_train

        start = tm.time()
        # train an ensemble of 4 DIRECTi models with different random seeds
        num_epoch = 50
        models = []
        for j in range(4):
            models.append(cb.directi.fit_DIRECTi(train, epoch=num_epoch, patience=10, random_seed = j, path="%d" % j))
        # build the BLAST index from the ensemble
        blast = cb.blast.BLAST(models, train).build_empirical()
        tr_time.append(tm.time() - start)

        # predict labels
        start = tm.time()
        test_pred = blast.query(test).annotate('cell_type')
        ts_time.append(tm.time() - start)

        truelab.extend(y_test)
        pred.extend(test_pred.values)

    # write results
    truelab = pd.DataFrame(truelab)
    pred = pd.DataFrame(pred)
    tr_time = pd.DataFrame(tr_time)
    ts_time = pd.DataFrame(ts_time)
    if (NumGenes == 0):
        truelab.to_csv("Cell_BLAST_True_Labels.csv", index = False)
        pred.to_csv("Cell_BLAST_Pred_Labels.csv", index = False)
        tr_time.to_csv("Cell_BLAST_Training_Time.csv", index = False)
        ts_time.to_csv("Cell_BLAST_Testing_Time.csv", index = False)
    else:
        truelab.to_csv("Cell_BLAST_" + str(NumGenes) + "_True_Labels.csv", index = False)
        pred.to_csv("Cell_BLAST_" + str(NumGenes) + "_Pred_Labels.csv", index = False)
        tr_time.to_csv("Cell_BLAST_" + str(NumGenes) + "_Training_Time.csv", index = False)
        ts_time.to_csv("Cell_BLAST_" + str(NumGenes) + "_Testing_Time.csv", index = False)
FILE: Scripts/run_DigitalCellSorter.py
================================================
import numpy as np
import pandas as pd
import scripts.DigitalCellSorter as DigitalCellSorter
import os
import time as tm
import rpy2.robjects as robjects
def run_DigitalCellSorter(DataPath, LabelsPath, CV_RDataPath, GeneListPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
    '''
    run DigitalCellSorter
    Wrapper script to run DigitalCellSorter on a benchmark dataset using a predefined genelist,
    outputs lists of true and predicted cell labels as csv files, as well as computation time.
    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
    as row names and gene names as column names.
    LabelsPath : Cell population annotations file path (.csv).
    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
    GeneListPath : Data file path to the genelist of marker genes.
    OutputDir : Output directory defining the path of the exported file.
    GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
    defining the genes order for each cross validation fold, default is NULL.
    NumGenes : Number of genes used in case of feature selection (integer), default is 0.
    '''
    # Load the cross-validation metadata written by Cross_Validation.R:
    # Cells_to_Keep (boolean filter) and col_Index (label column, 1-based).
    robjects.r['load'](CV_RDataPath)
    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
    col = np.array(robjects.r['col_Index'], dtype = 'int')
    col = col - 1  # convert 1-based R column index to 0-based
    # Read the expression matrix and labels, dropping filtered-out cells.
    data = pd.read_csv(DataPath,index_col=0,sep=',')
    data = data.iloc[tokeep]
    truelab = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col)
    truelab = truelab.iloc[tokeep]
    # Optional feature selection: keep the top-NumGenes ranked genes
    # (first column of the gene-order file).
    if (NumGenes > 0):
        features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')
        feat_to_use = features.iloc[0:NumGenes,0]
        data = data.iloc[:,feat_to_use]
    # DigitalCellSorter expects genes as rows and cells as columns.
    data = data.transpose()
    # number of different cell types in the data?
    # NOTE(review): n_clusters is hard-coded to 8 — presumably it should match
    # the number of cell populations in the dataset; confirm per dataset.
    n_clusters = 8
    AvailableCPUsCount = 1
    N_samples_for_distribution = 10000
    start = tm.time()
    # Process() clusters the cells and votes a cell type per cluster using the
    # marker-gene list; it returns a per-cell cluster index and writes result
    # files (including the voting spreadsheet read below) into OutputDir.
    pred = DigitalCellSorter.DigitalCellSorter().Process(data, 'DigitalCellSorter_Zhang',
    saveDir = OutputDir,
    geneListFileName = GeneListPath,
    N_samples_for_distribution = N_samples_for_distribution,
    AvailableCPUsCount = AvailableCPUsCount,
    clusterIndex=None,
    clusterName=None,
    n_clusters=n_clusters)
    # Total time covers the whole Process() call (clustering + annotation).
    runtime = tm.time() - start
    os.chdir(OutputDir)
    # Column 11 of the voting sheet holds the voted cell-type name per cluster
    # — position observed from DigitalCellSorter's output format; confirm on
    # tool upgrade.
    results = pd.read_excel('DigitalCellSorter_Zhang_voting.xlsx',header=0,index_col=None, usecols=[11])
    # Map each cell's cluster index i to the cluster's voted cell-type name.
    # NOTE(review): dtype '>U10' silently truncates type names longer than
    # 10 characters — verify against the label vocabulary.
    prediction = np.zeros(np.shape(pred), dtype='>U10')
    for i in range(len(results)):
        prediction[np.where(pred == i)] = results.values[i]
    prediction = pd.DataFrame(prediction)
    # Write outputs; filenames carry the NumGenes suffix when feature
    # selection was applied.
    if (NumGenes == 0):
        truelab.to_csv("DigitalCellSorter_True_Labels.csv", index = False)
        prediction.to_csv("DigitalCellSorter_Pred_Labels.csv", index = False)
        with open("DigitalCellSorter_Total_Time.csv", 'w') as f:
            f.write("%f\n" % runtime)
    else:
        truelab.to_csv("DigitalCellSorter_" + str(NumGenes) + "_True_Labels.csv", index = False)
        prediction.to_csv("DigitalCellSorter_" + str(NumGenes) + "_Pred_Labels.csv", index = False)
        with open("DigitalCellSorter_" + str(NumGenes) + "_Total_Time.csv", 'w') as f:
            f.write("%f\n" % runtime)
================================================
FILE: Scripts/run_Garnett_CV.R
================================================
run_Garnett_CV <- function(DataPath, LabelsPath, CV_RDataPath, GenesPath, MarkerPath, OutputDir, Human){
  "
  run Garnett
  Wrapper script to run Garnett on a benchmark dataset with 5-fold cross validation,
  outputs lists of true and predicted cell labels as csv files, as well as computation time.
  Parameters
  ----------
  DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
  as row names and gene names as column names.
  LabelsPath : Cell population annotations file path (.csv).
  CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
  GenesPath : Path to the file with the genenames
  MarkerPath : Path to the file with marker genes
  OutputDir : Output directory defining the path of the exported file.
  Human : boolean indicating whether the dataset is human (TRUE) or mouse (FALSE)
  "
  # Load Garnett plus the organism annotation database matching the dataset.
  library(garnett)
  if (Human) {
    library(org.Hs.eg.db)
  } else {
    library(org.Mm.eg.db)
  }
  # Loads n_folds, col_Index, Cells_to_Keep, Train_Idx and Test_Idx into scope.
  load(CV_RDataPath)
  # Labels: keep the annotation column selected by Cross_Validation.R and
  # drop the cells that were filtered out.
  labels <- as.matrix(read.csv(LabelsPath))
  labels <- as.vector(labels[, col_Index])
  labels <- labels[Cells_to_Keep]
  # Expression matrix: first row/column of the csv hold gene names / barcodes.
  mat <- read.table(DataPath, sep = ",")
  data <- mat[-1, -1]
  data <- data[Cells_to_Keep, ]
  data <- t(data) # ensure that the genes are rows, and the cells are columns
  cells <- mat[-1, 1]
  cells <- cells[Cells_to_Keep]
  # Feature (gene) metadata shared by all folds.
  fdata <- read.table(GenesPath)
  names(fdata) <- 'gene_short_name'
  row.names(fdata) <- fdata$gene_short_name
  fd <- new("AnnotatedDataFrame", data = fdata)
  true_labels <- list()
  pred_labels <- list()
  train_time <- list()
  test_time <- list()
  for (i in seq_len(n_folds)) {
    # Slice the fold's train/test cells (indices are columns after t()).
    lab_train = labels[Train_Idx[[i]]]
    lab_test = labels[Test_Idx[[i]]]
    train = data[, Train_Idx[[i]]]
    test = data[, Test_Idx[[i]]]
    cells_train = cells[Train_Idx[[i]]]
    cells_test = cells[Test_Idx[[i]]]
    pdata_train = data.frame(cells_train)
    pdata_test = data.frame(cells_test)
    row.names(train) <- row.names(fdata)
    row.names(test) <- row.names(fdata)
    colnames(train) <- row.names(pdata_train)
    colnames(test) <- row.names(pdata_test)
    pd_train <- new("AnnotatedDataFrame", data = pdata_train)
    pd_test <- new("AnnotatedDataFrame", data = pdata_test)
    # Build monocle CellDataSets (sparse) for Garnett.
    pbmc_cds_train <- newCellDataSet(as(train, "dgCMatrix"), phenoData = pd_train, featureData = fd)
    pbmc_cds_test <- newCellDataSet(as(test, "dgCMatrix"), phenoData = pd_test, featureData = fd)
    pbmc_cds_train <- estimateSizeFactors(pbmc_cds_train)
    pbmc_cds_test <- estimateSizeFactors(pbmc_cds_test)
    # Train the marker-based classifier on the training fold.
    start_train <- Sys.time()
    if (Human) {
      pbmc_classifier <- train_cell_classifier(cds = pbmc_cds_train,
                                               marker_file = MarkerPath,
                                               db = org.Hs.eg.db,
                                               cds_gene_id_type = "SYMBOL",
                                               num_unknown = 50,
                                               marker_file_gene_id_type = "SYMBOL")
    } else {
      pbmc_classifier <- train_cell_classifier(cds = pbmc_cds_train,
                                               marker_file = MarkerPath,
                                               db = org.Mm.eg.db,
                                               cds_gene_id_type = "SYMBOL",
                                               num_unknown = 50,
                                               marker_file_gene_id_type = "SYMBOL")
    }
    end_train <- Sys.time()
    train_time[i] <- as.numeric(end_train - start_train)
    # Classify the held-out fold (cluster_extend propagates labels to clusters).
    start_test <- Sys.time()
    if (Human) {
      pbmc_cds_test <- classify_cells(pbmc_cds_test,
                                      pbmc_classifier,
                                      db = org.Hs.eg.db,
                                      cluster_extend = TRUE,
                                      cds_gene_id_type = "SYMBOL")
    } else {
      pbmc_cds_test <- classify_cells(pbmc_cds_test,
                                      pbmc_classifier,
                                      db = org.Mm.eg.db,
                                      cluster_extend = TRUE,
                                      cds_gene_id_type = "SYMBOL")
    }
    end_test <- Sys.time()
    test_time[i] <- as.numeric(end_test - start_test)
    true_labels[i] <- list(lab_test)
    pred_labels[i] <- list(pData(pbmc_cds_test)$cluster_ext_type)
  }
  true_labels <- as.vector(unlist(true_labels))
  pred_labels <- as.vector(unlist(pred_labels))
  train_time <- as.vector(unlist(train_time))
  test_time <- as.vector(unlist(test_time))
  setwd(OutputDir)
  # Fix: training times now go to the Training_Time file and testing times to
  # the Testing_Time file (the two filenames were previously swapped).
  write.csv(train_time, 'Garnett_CV_Training_Time.csv', row.names = FALSE)
  write.csv(test_time, 'Garnett_CV_Testing_Time.csv', row.names = FALSE)
  write.csv(true_labels, 'Garnett_CV_True_Labels.csv', row.names = FALSE)
  write.csv(pred_labels, 'Garnett_CV_Pred_Labels.csv', row.names = FALSE)
}
================================================
FILE: Scripts/run_Garnett_Pretrained.R
================================================
run_Garnett_Pretrained <- function(DataPath, LabelsPath, GenesPath, CV_RDataPath, ClassifierPath, OutputDir, Human){
  "
  run Garnett
  Wrapper script to run Garnett on a benchmark dataset with a pretrained classifier,
  outputs lists of true and predicted cell labels as csv files, as well as computation time.
  Parameters
  ----------
  DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
  as row names and gene names as column names.
  LabelsPath : Cell population annotations file path (.csv).
  CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
  GenesPath : Path to the file with the genenames
  ClassifierPath : Path to the pretrained classifier
  OutputDir : Output directory defining the path of the exported file.
  Human : boolean indicating whether the dataset is human (TRUE) or mouse (FALSE)
  "
  # Load Garnett plus the organism annotation database matching the dataset.
  library(garnett)
  if (Human) {
    library(org.Hs.eg.db)
  } else {
    library(org.Mm.eg.db)
  }
  # CV_RDataPath provides Cells_to_Keep; ClassifierPath provides the
  # pretrained classifier object (hsPBMC for human, mmLung for mouse).
  load(CV_RDataPath)
  load(ClassifierPath)
  # NOTE(review): unlike run_Garnett_CV, no col_Index column selection is done
  # here — this assumes the labels file has a single annotation column; verify.
  labels <- as.matrix(read.csv(LabelsPath))
  labels <- labels[Cells_to_Keep]
  # Expression matrix: first row/column of the csv hold gene names / barcodes.
  mat <- read.table(DataPath, sep = ",")
  data <- mat[-1,-1]
  data <- data[Cells_to_Keep,]
  data <- t(data) #ensure that the genes are rows, and the cells are columns
  barcodes <- mat[-1,1]
  pdata = data.frame(barcodes)
  # Gene metadata for the CellDataSet.
  fdata <- read.table(GenesPath)
  names(fdata) <- 'gene_short_name'
  row.names(fdata) <- fdata$gene_short_name
  row.names(data) <- row.names(fdata)
  colnames(data) <- row.names(pdata)
  pd <- new("AnnotatedDataFrame", data = pdata)
  fd <- new("AnnotatedDataFrame", data = fdata)
  pbmc_cds <- newCellDataSet(as(data, "dgCMatrix"),
                             phenoData = pd,
                             featureData = fd)
  # Timed section covers size-factor estimation plus classification.
  start_time <- Sys.time()
  pbmc_cds <- estimateSizeFactors(pbmc_cds)
  if (Human){
    pbmc_cds <- classify_cells(pbmc_cds, hsPBMC, db = org.Hs.eg.db, cluster_extend = TRUE, cds_gene_id_type = "SYMBOL")
  } else {
    pbmc_cds <- classify_cells(pbmc_cds, mmLung, db = org.Mm.eg.db, cluster_extend = TRUE, cds_gene_id_type = "SYMBOL")
  }
  end_time <- Sys.time()
  test_time <- as.numeric(end_time - start_time)
  setwd(OutputDir)
  # NOTE(review): predictions are written tab-separated via write.table into a
  # .csv named without the "Pretrained" prefix, while the other two outputs use
  # write.csv with the prefix — downstream scripts may depend on these exact
  # names/formats, so they are preserved here.
  write.table(pData(pbmc_cds)$cluster_ext_type, file = "Garnett_Pred_Labels.csv", append = FALSE, quote = TRUE, sep = "\t",
              eol = "\n", na = "NA", dec = ".", row.names = FALSE,
              qmethod = c("escape", "double"),
              fileEncoding = "")
  write.csv(labels,"Garnett_Pretrained_True_Labels.csv", row.names = FALSE)
  write.csv(test_time,'Garnett_Pretrained_Testing_Time.csv',row.names = FALSE)
}
================================================
FILE: Scripts/run_LAmbDA.py
================================================
# -*- coding: utf-8 -*-
"""
Created on Thu May 23 13:51:15 2019
@author: Lieke
"""
import os
import numpy as np
import pandas as pd
import time as tm
import rpy2.robjects as robjects
import tensorflow as tf
import math
import scipy.io as sio
import optunity as opt
from tensorflow.contrib.tensor_forest.python import tensor_forest
from tensorflow.python.ops import resources
def run_LAmbDA(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
    '''
    run LAmbDA classifier
    Wrapper script to run LAmbDA on a benchmark dataset with 5-fold cross validation,
    outputs lists of true and predicted cell labels as csv files, as well as computation time.
    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
    as row names and gene names as column names.
    LabelsPath : Cell population annotations file path (.csv).
    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
    OutputDir : Output directory defining the path of the exported file.
    GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
    defining the genes order for each cross validation fold, default is NULL.
    NumGenes : Number of genes used in case of feature selection (integer), default is 0.
    '''
    # Cross-validation layout produced by Cross_Validation.R.
    robjects.r['load'](CV_RDataPath)
    nfolds = np.array(robjects.r['n_folds'], dtype = 'int')
    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
    col = np.array(robjects.r['col_Index'], dtype = 'int')
    col = col - 1  # R indices are 1-based
    test_ind = np.array(robjects.r['Test_Idx'])
    train_ind = np.array(robjects.r['Train_Idx'])
    # Expression matrix (cells x genes) and matching annotations.
    data = pd.read_csv(DataPath, index_col=0, sep=',')
    labels = pd.read_csv(LabelsPath, header=0, index_col=None, sep=',', usecols = col)
    labels = labels.iloc[tokeep]
    data = data.iloc[tokeep]
    # Optional feature-selection ranking (one column of gene indices per fold).
    if (NumGenes > 0):
        features = pd.read_csv(GeneOrderPath, header=0, index_col=None, sep=',')
    # folder with results
    os.chdir(OutputDir)
    tr_time = []
    ts_time = []
    truelab = np.zeros([len(labels), 1], dtype = int)
    predlab = np.zeros([len(labels), 1], dtype = int)
    for i in range(np.squeeze(nfolds)):
        # run_LAmbDA2 communicates with this function via module-level globals.
        global X, Y, Gnp, Dnp, train, test, prt, cv
        test_ind_i = np.array(test_ind[i], dtype = 'int') - 1
        train_ind_i = np.array(train_ind[i], dtype = 'int') - 1
        X = np.array(data)
        if (NumGenes > 0):
            X = np.log2(X/10+1)
            feat_to_use = features.iloc[0:NumGenes, i]
            X = X[:, feat_to_use]
        else:
            # No ranking supplied: filter sparse/low-variance genes instead.
            X = np.log2(np.transpose(select_feats(np.transpose(X), 0.5, 80))/10+1)
        # One-hot encode the labels.
        uniq = np.unique(labels)
        Y = np.zeros([len(labels), len(uniq)], int)
        for j in range(len(uniq)):
            Y[np.where(labels == uniq[j])[0], j] = 1
        Y = np.array(Y)
        # Single-dataset case: identity label mapping (Gnp) and a single
        # dataset-membership column (Dnp).
        Gnp = np.zeros([len(uniq), len(uniq)], int)
        np.fill_diagonal(Gnp, 1)
        Gnp = np.array(Gnp)
        Dnp = np.ones([len(uniq), 1], int)
        Dnp = np.array(Dnp)
        # Inner 75/25 split of the training fold for hyperparameter search.
        # NOTE(review): these positions are not mapped through train_ind_i
        # (kept as in the original LAmbDA wrapper); the redraw loop below even
        # permutes over all cells.
        train_samp = int(np.floor(0.75*len(train_ind_i)))
        test_samp = len(train_ind_i) - train_samp
        perm = np.random.permutation(len(train_ind_i))
        train = perm[0:train_samp]
        # Fix: the validation part is the remainder of the permutation.  The
        # original sliced perm[train_samp:test_samp+1], which is empty for any
        # 75/25 split since train_samp > test_samp.
        test = perm[train_samp:train_samp+test_samp]
        # Re-draw until every class has at least 5 training cells.
        while (np.sum(np.sum(Y[train, :], 0) < 5) > 0):
            perm = np.random.permutation(X.shape[0])
            train = perm[0:train_samp]
            test = perm[train_samp:train_samp+test_samp]
        cv = i
        optunity_it = 0
        prt = False
        opt_params = None
        # Hyperparameter search: optunity minimizes run_LAmbDA2's returned loss.
        start = tm.time()
        opt_params, _, _ = opt.minimize(run_LAmbDA2, solver_name='sobol', gamma=[0.8,1.2], delta=[0.05,0.95], tau=[10.0,11.0], prc_cut=[20,50], bs_prc=[0.2,0.6], num_trees=[10,200], max_nodes=[100,1000], num_evals=50)
        tr_time.append(tm.time()-start)
        print("Finished training!")
        # Final run on the real fold split with the tuned parameters; with
        # prt True, run_LAmbDA2 saves predictions/truth to per-fold .mat files.
        prt = True
        train = train_ind_i
        test = test_ind_i
        start = tm.time()
        err = run_LAmbDA2(opt_params['gamma'], opt_params['delta'], opt_params['tau'], opt_params['prc_cut'], opt_params['bs_prc'], opt_params['num_trees'], opt_params['max_nodes'])
        ts_time.append(tm.time()-start)
        tf.reset_default_graph()
        # Collect the fold's predictions back from the .mat files.
        predfile = 'preds_cv' + str(cv) + '.mat'
        truefile = 'truth_cv' + str(cv) + '.mat'
        pred = sio.loadmat(predfile)['preds']
        truth = sio.loadmat(truefile)['labels']
        predlab[test_ind_i, 0] = np.argmax(pred, axis=1)
        truelab[test_ind_i, 0] = np.argmax(truth, axis=1)
    truelab = pd.DataFrame(truelab)
    predlab = pd.DataFrame(predlab)
    tr_time = pd.DataFrame(tr_time)
    ts_time = pd.DataFrame(ts_time)
    # Write outputs; filenames carry the NumGenes infix when feature
    # selection was applied (same strings as before).
    prefix = "LAmbDA_" if NumGenes == 0 else "LAmbDA_" + str(NumGenes) + "_"
    truelab.to_csv(prefix + "True_Labels.csv", index = False)
    predlab.to_csv(prefix + "Pred_Labels.csv", index = False)
    tr_time.to_csv(prefix + "Training_Time.csv", index = False)
    ts_time.to_csv(prefix + "Testing_Time.csv", index = False)
##### Functions copied from LAmbDA's Github
def wt_cutoff(colnum, cutoff, Gtmp, gamma):
    # Weighted per-class cutoff: classes whose row in Gtmp has a smaller sum
    # get a proportionally larger cutoff, with the scaling sharpened by gamma.
    row_totals = np.sum(Gtmp, axis=1)
    ratio = max(row_totals) / row_totals[colnum]
    weight = math.log(ratio + 1, 2) ** gamma
    return math.ceil(cutoff * weight)
def resample(prc_cut, Y, Gtmp, train, gamma):
    # Rebalance the training indices: classes below their weighted cutoff are
    # over-sampled (with replacement), classes above it are under-sampled
    # (without replacement).  Returns the adjusted index array.
    extra = list()
    drop = list()
    class_counts = np.sum(Y[train, :], axis=0)
    base_cutoff = math.ceil(np.percentile(class_counts, prc_cut))
    for cls in range(len(class_counts)):
        count = class_counts[cls]
        if count == 0:
            continue  # absent class: nothing to balance
        target = wt_cutoff(cls, base_cutoff, Gtmp, gamma)
        if count == target:
            continue  # already at the cutoff
        members = np.squeeze(np.array(np.where(Y[train, cls] >= 1)))
        if count < target:
            picked = np.random.choice(train[members], int(target - count))
            extra = extra + picked.tolist()
        else:
            picked = np.random.choice(train[members], int(count - target), replace=False)
            drop = drop + picked.tolist()
    kept = [val for val in train if val not in drop]
    return np.concatenate((list(kept), extra))
def select_feats(Xtmp, num_zero_prc_cut, var_prc_cut):
    # Two-stage feature filter on a features-x-samples matrix:
    # 1) drop features that are zero in at least num_zero_prc_cut of samples,
    # 2) drop features at or below the var_prc_cut variance percentile.
    zero_counts = np.sum(Xtmp == 0, axis=1)
    dense_enough = zero_counts < num_zero_prc_cut * Xtmp.shape[1]
    Xtmp = Xtmp[dense_enough, :]
    variances = np.var(Xtmp, axis=1)
    informative = variances > np.percentile(variances, var_prc_cut)
    return Xtmp[informative, :]
def get_yn(predict, ys, delta, tau, output_feats):
    # Compute adapted one-hot training labels ("y new") from the current model
    # predictions, per the LAmbDA label-adaptation scheme.  Reads module-level
    # globals Dnp and Gnp (in this benchmark Dnp is an all-ones column and Gnp
    # is the identity — see run_LAmbDA — so the general multi-dataset mapping
    # largely reduces to a rescale-and-argmax).
    #
    # predict      : tensor of per-cell class scores from the forest
    # ys           : one-hot input labels for the same cells
    # delta        : blend weight between per-label aggregate and per-cell scores
    # tau          : exponent applied to the per-class rescaling factor
    # output_feats : number of output classes (one-hot depth)
    D = tf.cast(Dnp, tf.float32);
    G = tf.cast(Gnp, tf.float32);
    ys = tf.cast(ys, tf.float32);
    # Cm: mean (shifted) prediction per label group; +0.1 avoids zeros in the
    # ratio below.
    Cm = tf.matmul(tf.transpose(tf.matmul(ys,D)),predict+0.1)/tf.reshape(tf.reduce_sum(tf.transpose(tf.matmul(ys,D)),1),(-1,1));
    # mCm: mean of Cm restricted to the classes reachable through D*G.
    mCm = tf.reshape(tf.reduce_mean(tf.cast(tf.matmul(tf.transpose(D),G)>0,tf.float32)*Cm,1),(-1,1));
    # yw: predictions rescaled by (mCm/Cm)^tau — appears to boost classes the
    # model currently under-predicts.
    yw = tf.multiply(predict+0.1,tf.matmul(tf.matmul(ys,D),tf.pow(mCm/Cm,tau)));
    # ye: mask the rescaled scores to classes allowed by the label mapping G.
    ye = tf.multiply(tf.matmul(ys,G),yw);
    # yt: scores aggregated over all cells sharing the same input label.
    yt = tf.matmul(ys,tf.matmul(tf.transpose(ys),ye));
    # ya: delta blends the per-label aggregate with the per-cell scores.
    ya = (delta*yt)+((1-delta)*ye)
    # Hard one-hot target from the argmax of the blended scores.
    yn = tf.cast(tf.one_hot(tf.argmax(ya,axis=1),output_feats), dtype=tf.float32)
    return(yn)
def get_yi(rowsums, G2, ys):
    # Map one-hot input labels onto output classes through the restricted
    # mapping G2 (rows of G with a single output class).  The rowsums argument
    # is accepted for interface compatibility but not used here.
    mapped = tf.matmul(tf.cast(ys, tf.float32), tf.cast(G2, tf.float32))
    return tf.cast(mapped, dtype=tf.float32)
def run_LAmbDA2(gamma, delta, tau, prc_cut, bs_prc, num_trees, max_nodes):
    # Optunity objective: builds a fresh TF tensor-forest graph, trains it with
    # the LAmbDA label-adaptation scheme, and returns the mean squared one-hot
    # error on the current test split (lower is better; optunity minimizes it).
    # Communicates with run_LAmbDA via module-level globals:
    #   X, Y        : data matrix and one-hot labels
    #   Gnp, Dnp    : label-mapping and dataset-membership matrices
    #   train, test : row indices of the current train/test splits
    #   prt         : when True, persist predictions/truth to .mat files
    #   cv          : current fold number (used in the .mat filenames)
    global X, Y, Gnp, Dnp, train, test, prt, cv
    D = tf.cast(Dnp, tf.float32);
    G = tf.cast(Gnp, tf.float32);
    #optunity_it = optunity_it+1;
    # Optunity passes floats; the forest needs integer hyperparameters.
    num_trees = int(num_trees);
    max_nodes = int(max_nodes);
    prc_cut = int(np.ceil(prc_cut));
    print("gamma=%.4f, delta=%.4f, tau=%.4f, prc_cut=%i, bs_prc=%.4f, num_trees=%i, max_nodes=%i" % (gamma, delta, tau, prc_cut, bs_prc, num_trees, max_nodes))
    input_feats = X.shape[1];
    num_labls = G.shape.as_list();
    output_feats = num_labls[1];
    num_labls = num_labls[0];
    rowsums = np.sum(Gnp,axis=1);
    # Class-balance the training indices (over/under-sampling per class).
    train2 = resample(prc_cut, Y, Gnp, train, gamma); # Bug??
    # Mini-batch size as a fraction of the resampled training set.
    bs = int(np.ceil(bs_prc*train2.size))
    xs = tf.placeholder(tf.float32, [None,input_feats])
    #ys = tf.placeholder(tf.float32, [None,num_labls])
    yin = tf.placeholder(tf.int32, [None])
    print("Vars loaded xs and ys created")
    hparams = tensor_forest.ForestHParams(num_classes=output_feats,
                                          num_features=input_feats,
                                          num_trees=num_trees,
                                          max_nodes=max_nodes).fill()
    print("Tensor forest hparams created")
    forest_graph = tensor_forest.RandomForestGraphs(hparams)
    print("Tensor forest graph created")
    train_op = forest_graph.training_graph(xs, yin)
    loss_op = forest_graph.training_loss(xs, yin)
    print("Loss and train ops created")
    predict, _, _ = forest_graph.inference_graph(xs)
    print("Tensor forest variables created through predict")
    # "Accuracy" here is actually a mean squared one-hot error, not a hit rate.
    accuracy_op = tf.reduce_mean(tf.reduce_sum(tf.square(tf.one_hot(yin,output_feats)-predict),reduction_indices=[1]))
    print(tf.reduce_sum(tf.square(tf.one_hot(yin,output_feats)-predict),reduction_indices=[1]))
    #predict = tf.one_hot(pred);
    print("Lambda specific variables created")
    # Identifiable labels: rows of G mapping to exactly one output class
    # (rows with rowsums > 1 are zeroed out of G2).
    G2 = np.copy(Gnp);
    G2[rowsums>1,:] = 0;
    YI = np.matmul(Y,G2);
    YIrs = np.sum(YI,axis=1);
    trainI = train2[np.in1d(train2,np.where(YIrs==1))];
    print("data type trainI,",trainI.dtype)
    testI = test[np.in1d(test,np.where(YIrs==1))];
    print("trainI testI created")
    #init_vars=tf.global_variables_initializer()
    # Tensor forest keeps state in shared resources, which also need init.
    init_vars = tf.group(tf.global_variables_initializer(),
                         resources.initialize_resources(resources.shared_resources()))
    sess = tf.Session()
    sess.run(init_vars)
    print("Session started")
    #beep = sess.run(predict,feed_dict={xs:X[1:100,:]});
    #beep = sess.run(predict,feed_dict={xs:X[train2[0:bs],:]});
    # Feed dicts: *I variants use identifiable labels via get_yi; the plain
    # variants use LAmbDA-adapted labels via get_yn on current predictions.
    tensor_trainI = {xs: X[trainI, :], yin: sess.run(tf.argmax(get_yi(rowsums,G2,Y[trainI, :]),axis=1))}
    print("tensor_trainI made")
    tensor_testI = {xs: X[testI, :], yin: sess.run(tf.argmax(get_yi(rowsums,G2,Y[testI, :]),axis=1))}
    print("tensor_testI made")
    tensor_train = {xs: X[train2[0:bs], :], yin: sess.run(tf.argmax(get_yn(sess.run(predict,feed_dict={xs:X[train2[0:bs],:]}),Y[train2[0:bs], :],delta,tau,output_feats),axis=1))}
    print("tensor_train made")
    tensor_test = {xs: X[test, :], yin: sess.run(tf.argmax(get_yn(sess.run(predict,feed_dict={xs:X[test,:]}),Y[test, :],delta,tau,output_feats),axis=1))}
    print("tensor_test made")
    #**********************************
    #print("Loss and training steps created with sample tensors")
    # Setting params and initializing
    print("Beginning iterations")
    # Two-phase training: iterations 1-49 on identifiable labels, 50-100 on
    # the adapted labels; loss is printed every 10th iteration.
    print(X.shape)
    for i in range(1,101):
        if i < 50:
            sess.run(train_op, feed_dict=tensor_trainI)
            #print("ran train op")
            if i % 10 == 0:
                print(str(sess.run(accuracy_op, feed_dict=tensor_trainI)) + ' ' + str(sess.run(accuracy_op, feed_dict=tensor_testI)))
        else:
            sess.run(train_op, feed_dict=tensor_train)
            if i % 10 == 0:
                print(str(sess.run(accuracy_op, feed_dict=tensor_train)) + ' ' + str(sess.run(accuracy_op, feed_dict=tensor_test)))
            elif i % 10 == 0:
                # NOTE(review): dead code — this condition duplicates the
                # branch above, so the batch refresh never runs; and
                # np.random_shuffle does not exist (np.random.shuffle), which
                # would raise AttributeError if it were ever reached.
                np.random_shuffle(train2);
                tensor_train = {xs: X[train2[0:bs], :], yin: sess.run(get_yn(sess.run(predict,feed_dict={xs:X[train2[0:bs],:]}),Y[train2[0:bs], :],delta,tau,output_feats))}
    if prt:
        # Final evaluation pass: persist predictions and truth for this fold
        # so run_LAmbDA can collect them from the .mat files.
        blah = sess.run(predict, feed_dict=tensor_test);
        sio.savemat('preds_cv' + str(cv) + '.mat', {'preds': blah});
        sio.savemat('truth_cv' + str(cv) + '.mat', {'labels': Y[test, :]});
    acc = sess.run(accuracy_op, feed_dict=tensor_test)
    print("loss1=%.4f, gamma=%.4f, delta=%.4f, tau=%.4f, prc_cut=%i, bs_prc=%.4f, num_trees=%i, max_nodes=%i" % (acc, gamma, delta, tau, prc_cut, bs_prc, num_trees, max_nodes))
    tf.reset_default_graph();
    return(acc)
================================================
FILE: Scripts/run_LDA.py
================================================
import os
import numpy as np
import pandas as pd
import time as tm
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import rpy2.robjects as robjects
def run_LDA(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
    '''
    Run the baseline LDA classifier with 5-fold cross validation.

    Reads the fold layout produced by Cross_Validation.R, trains a
    LinearDiscriminantAnalysis model per fold on log1p-normalized counts, and
    writes true labels, predicted labels, training time and testing time as
    CSV files into OutputDir.

    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique
    barcodes as row names and gene names as column names.
    LabelsPath : Cell population annotations file path (.csv).
    CV_RDataPath : Cross validation RData file path (.RData), obtained from
    Cross_Validation.R function.
    OutputDir : Output directory defining the path of the exported file.
    GeneOrderPath : Gene order file path (.csv) obtained from feature
    selection, defining the genes order for each cross validation fold,
    default is NULL.
    NumGenes : Number of genes used in case of feature selection (integer),
    default is 0.
    '''
    # Cross-validation layout produced by Cross_Validation.R.
    robjects.r['load'](CV_RDataPath)
    nfolds = np.array(robjects.r['n_folds'], dtype='int')
    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype='bool')
    col = np.array(robjects.r['col_Index'], dtype='int') - 1  # R is 1-based
    test_ind = np.array(robjects.r['Test_Idx'])
    train_ind = np.array(robjects.r['Train_Idx'])
    # Expression matrix (cells x genes) and matching annotations.
    data = pd.read_csv(DataPath, index_col=0, sep=',')
    labels = pd.read_csv(LabelsPath, header=0, index_col=None, sep=',', usecols=col)
    labels = labels.iloc[tokeep]
    data = data.iloc[tokeep]
    # Optional feature-selection ranking (one column of gene indices per fold).
    if NumGenes > 0:
        features = pd.read_csv(GeneOrderPath, header=0, index_col=None, sep=',')
    os.chdir(OutputDir)
    # Normalize counts before classification.
    data = np.log1p(data)
    Classifier = LinearDiscriminantAnalysis()
    tr_time, ts_time = [], []
    truelab, pred = [], []
    for fold in range(np.squeeze(nfolds)):
        # Convert the fold's 1-based R indices to 0-based positions.
        test_idx = np.array(test_ind[fold], dtype='int') - 1
        train_idx = np.array(train_ind[fold], dtype='int') - 1
        x_train = data.iloc[train_idx]
        x_test = data.iloc[test_idx]
        y_train = labels.iloc[train_idx]
        y_test = labels.iloc[test_idx]
        if NumGenes > 0:
            selected = features.iloc[0:NumGenes, fold]
            x_train = x_train.iloc[:, selected]
            x_test = x_test.iloc[:, selected]
        start = tm.time()
        Classifier.fit(x_train, y_train)
        tr_time.append(tm.time() - start)
        start = tm.time()
        fold_pred = Classifier.predict(x_test)
        ts_time.append(tm.time() - start)
        truelab.extend(y_test.values)
        pred.extend(fold_pred)
    truelab = pd.DataFrame(truelab)
    pred = pd.DataFrame(pred)
    tr_time = pd.DataFrame(tr_time)
    ts_time = pd.DataFrame(ts_time)
    # Filenames carry the NumGenes infix when feature selection was applied.
    prefix = "LDA_" if NumGenes == 0 else "LDA_" + str(NumGenes) + "_"
    truelab.to_csv(prefix + "True_Labels.csv", index=False)
    pred.to_csv(prefix + "Pred_Labels.csv", index=False)
    tr_time.to_csv(prefix + "Training_Time.csv", index=False)
    ts_time.to_csv(prefix + "Testing_Time.csv", index=False)
================================================
FILE: Scripts/run_LDA_rejection.py
================================================
import os
import numpy as np
import pandas as pd
import time as tm
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import rpy2.robjects as robjects
def run_LDA_rejection(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0, Threshold = 0.7):
    '''
    run baseline classifier: LDA with rejection
    Wrapper script to run LDA classifier on a benchmark dataset with 5-fold cross validation,
    rejecting cells whose maximum posterior probability is below Threshold,
    outputs lists of true and predicted cell labels as csv files, as well as computation time.
    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
    as row names and gene names as column names.
    LabelsPath : Cell population annotations file path (.csv).
    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
    OutputDir : Output directory defining the path of the exported file.
    GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
    defining the genes order for each cross validation fold, default is NULL.
    NumGenes : Number of genes used in case of feature selection (integer), default is 0.
    Threshold : Posterior-probability cutoff below which a cell is labeled 'Unknown', default is 0.7.
    '''
    # Cross-validation layout produced by Cross_Validation.R.
    robjects.r['load'](CV_RDataPath)
    nfolds = np.array(robjects.r['n_folds'], dtype = 'int')
    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
    col = np.array(robjects.r['col_Index'], dtype = 'int')
    col = col - 1  # R indices are 1-based
    test_ind = np.array(robjects.r['Test_Idx'])
    train_ind = np.array(robjects.r['Train_Idx'])
    # Expression matrix (cells x genes) and matching annotations.
    data = pd.read_csv(DataPath, index_col=0, sep=',')
    labels = pd.read_csv(LabelsPath, header=0, index_col=None, sep=',', usecols = col)
    labels = labels.iloc[tokeep]
    data = data.iloc[tokeep]
    # Optional feature-selection ranking (one column of gene indices per fold).
    if (NumGenes > 0):
        features = pd.read_csv(GeneOrderPath, header=0, index_col=None, sep=',')
    # folder with results
    os.chdir(OutputDir)
    # normalize data
    data = np.log1p(data)
    Classifier = LinearDiscriminantAnalysis()
    tr_time = []
    ts_time = []
    truelab = []
    pred = []
    for i in range(np.squeeze(nfolds)):
        # Convert the fold's 1-based R indices to 0-based positions.
        test_ind_i = np.array(test_ind[i], dtype = 'int') - 1
        train_ind_i = np.array(train_ind[i], dtype = 'int') - 1
        train = data.iloc[train_ind_i]
        test = data.iloc[test_ind_i]
        y_train = labels.iloc[train_ind_i]
        y_test = labels.iloc[test_ind_i]
        if (NumGenes > 0):
            feat_to_use = features.iloc[0:NumGenes, i]
            train = train.iloc[:, feat_to_use]
            test = test.iloc[:, feat_to_use]
        start = tm.time()
        Classifier.fit(train, y_train)
        tr_time.append(tm.time()-start)
        start = tm.time()
        predicted = Classifier.predict(test)
        prob = np.max(Classifier.predict_proba(test), axis = 1)
        unlabeled = np.where(prob < Threshold)
        # Fix: predict() returns a fixed-width unicode array sized to the
        # longest class name, so assigning 'Unknown' into it was silently
        # truncated (e.g. to 'Unkn') whenever class names were short.
        # Convert to object dtype before rejecting low-confidence cells.
        predicted = predicted.astype(object)
        predicted[unlabeled] = 'Unknown'
        ts_time.append(tm.time()-start)
        truelab.extend(y_test.values)
        pred.extend(predicted)
    truelab = pd.DataFrame(truelab)
    pred = pd.DataFrame(pred)
    tr_time = pd.DataFrame(tr_time)
    ts_time = pd.DataFrame(ts_time)
    # NOTE(review): these filenames collide with run_LDA's outputs in the same
    # OutputDir; kept unchanged for compatibility with evaluation scripts.
    if (NumGenes == 0):
        truelab.to_csv("LDA_True_Labels.csv", index = False)
        pred.to_csv("LDA_Pred_Labels.csv", index = False)
        tr_time.to_csv("LDA_Training_Time.csv", index = False)
        ts_time.to_csv("LDA_Testing_Time.csv", index = False)
    else:
        truelab.to_csv("LDA_" + str(NumGenes) + "_True_Labels.csv", index = False)
        pred.to_csv("LDA_" + str(NumGenes) + "_Pred_Labels.csv", index = False)
        tr_time.to_csv("LDA_" + str(NumGenes) + "_Training_Time.csv", index = False)
        ts_time.to_csv("LDA_" + str(NumGenes) + "_Testing_Time.csv", index = False)
================================================
FILE: Scripts/run_NMC.py
================================================
import os
import numpy as np
import pandas as pd
import time as tm
from sklearn.neighbors import NearestCentroid
import rpy2.robjects as robjects
def run_NMC(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
    '''
    Run the baseline nearest-mean (NearestCentroid) classifier with 5-fold
    cross validation.

    Reads the fold layout produced by Cross_Validation.R, trains a
    NearestCentroid model per fold on log1p-normalized counts, and writes true
    labels, predicted labels, training time and testing time as CSV files into
    OutputDir.

    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique
    barcodes as row names and gene names as column names.
    LabelsPath : Cell population annotations file path (.csv).
    CV_RDataPath : Cross validation RData file path (.RData), obtained from
    Cross_Validation.R function.
    OutputDir : Output directory defining the path of the exported file.
    GeneOrderPath : Gene order file path (.csv) obtained from feature
    selection, defining the genes order for each cross validation fold,
    default is NULL.
    NumGenes : Number of genes used in case of feature selection (integer),
    default is 0.
    '''
    # Cross-validation layout produced by Cross_Validation.R.
    robjects.r['load'](CV_RDataPath)
    nfolds = np.array(robjects.r['n_folds'], dtype='int')
    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype='bool')
    col = np.array(robjects.r['col_Index'], dtype='int') - 1  # R is 1-based
    test_ind = np.array(robjects.r['Test_Idx'])
    train_ind = np.array(robjects.r['Train_Idx'])
    # Expression matrix (cells x genes) and matching annotations.
    data = pd.read_csv(DataPath, index_col=0, sep=',')
    labels = pd.read_csv(LabelsPath, header=0, index_col=None, sep=',', usecols=col)
    labels = labels.iloc[tokeep]
    data = data.iloc[tokeep]
    # Optional feature-selection ranking (one column of gene indices per fold).
    if NumGenes > 0:
        features = pd.read_csv(GeneOrderPath, header=0, index_col=None, sep=',')
    os.chdir(OutputDir)
    # Normalize counts before classification.
    data = np.log1p(data)
    Classifier = NearestCentroid()
    tr_time, ts_time = [], []
    truelab, pred = [], []
    for fold in range(np.squeeze(nfolds)):
        # Convert the fold's 1-based R indices to 0-based positions.
        test_idx = np.array(test_ind[fold], dtype='int') - 1
        train_idx = np.array(train_ind[fold], dtype='int') - 1
        x_train = data.iloc[train_idx]
        x_test = data.iloc[test_idx]
        y_train = labels.iloc[train_idx]
        y_test = labels.iloc[test_idx]
        if NumGenes > 0:
            selected = features.iloc[0:NumGenes, fold]
            x_train = x_train.iloc[:, selected]
            x_test = x_test.iloc[:, selected]
        start = tm.time()
        Classifier.fit(x_train, y_train)
        tr_time.append(tm.time() - start)
        start = tm.time()
        fold_pred = Classifier.predict(x_test)
        ts_time.append(tm.time() - start)
        truelab.extend(y_test.values)
        pred.extend(fold_pred)
    truelab = pd.DataFrame(truelab)
    pred = pd.DataFrame(pred)
    tr_time = pd.DataFrame(tr_time)
    ts_time = pd.DataFrame(ts_time)
    # Filenames carry the NumGenes infix when feature selection was applied.
    prefix = "NMC_" if NumGenes == 0 else "NMC_" + str(NumGenes) + "_"
    truelab.to_csv(prefix + "True_Labels.csv", index=False)
    pred.to_csv(prefix + "Pred_Labels.csv", index=False)
    tr_time.to_csv(prefix + "Training_Time.csv", index=False)
    ts_time.to_csv(prefix + "Testing_Time.csv", index=False)
================================================
FILE: Scripts/run_RF.py
================================================
import os
import numpy as np
import pandas as pd
import time as tm
from sklearn.ensemble import RandomForestClassifier
import rpy2.robjects as robjects
def run_RF(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
    '''
    run baseline classifier: RF

    Wrapper script to run a RF classifier with 50 trees on a benchmark dataset
    with 5-fold cross validation, outputs lists of true and predicted cell
    labels as csv files, as well as computation time.

    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique
        barcodes as row names and gene names as column names.
    LabelsPath : Cell population annotations file path (.csv).
    CV_RDataPath : Cross validation RData file path (.RData), obtained from
        Cross_Validation.R function.
    OutputDir : Output directory defining the path of the exported files.
    GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
        defining the genes order for each cross validation fold, default is "".
    NumGenes : Number of genes used in case of feature selection (integer),
        default is 0 (no feature selection).
    '''
    # read the RData file written by Cross_Validation.R; it defines
    # n_folds, Cells_to_Keep, col_Index, Test_Idx and Train_Idx in R's globals
    robjects.r['load'](CV_RDataPath)
    nfolds = np.array(robjects.r['n_folds'], dtype='int')
    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype='bool')
    col = np.array(robjects.r['col_Index'], dtype='int')
    col = col - 1  # R indices are 1-based; pandas usecols is 0-based
    test_ind = np.array(robjects.r['Test_Idx'])
    train_ind = np.array(robjects.r['Train_Idx'])

    # read the data and drop cells filtered out by the CV setup
    data = pd.read_csv(DataPath, index_col=0, sep=',')
    labels = pd.read_csv(LabelsPath, header=0, index_col=None, sep=',', usecols=col)
    labels = labels.iloc[tokeep]
    data = data.iloc[tokeep]

    # read the feature file: one column of ranked gene indices per fold
    if NumGenes > 0:
        features = pd.read_csv(GeneOrderPath, header=0, index_col=None, sep=',')

    # results are written into the output folder
    os.chdir(OutputDir)

    # normalize data
    data = np.log1p(data)

    Classifier = RandomForestClassifier(n_estimators=50)

    tr_time = []
    ts_time = []
    truelab = []
    pred = []

    for i in range(np.squeeze(nfolds)):
        # fold indices come from R and are 1-based; shift to 0-based
        test_ind_i = np.array(test_ind[i], dtype='int') - 1
        train_ind_i = np.array(train_ind[i], dtype='int') - 1
        train = data.iloc[train_ind_i]
        test = data.iloc[test_ind_i]
        y_train = labels.iloc[train_ind_i]
        y_test = labels.iloc[test_ind_i]

        if NumGenes > 0:
            # restrict to the top-ranked genes of this fold (positional indices)
            feat_to_use = features.iloc[0:NumGenes, i]
            train = train.iloc[:, feat_to_use]
            test = test.iloc[:, feat_to_use]

        start = tm.time()
        # ravel the single-column label frame to 1-d; passing the DataFrame
        # directly triggers sklearn's DataConversionWarning
        Classifier.fit(train, np.ravel(y_train))
        tr_time.append(tm.time() - start)

        start = tm.time()
        predicted = Classifier.predict(test)
        ts_time.append(tm.time() - start)

        truelab.extend(y_test.values)
        pred.extend(predicted)

    truelab = pd.DataFrame(truelab)
    pred = pd.DataFrame(pred)
    tr_time = pd.DataFrame(tr_time)
    ts_time = pd.DataFrame(ts_time)

    if NumGenes == 0:
        truelab.to_csv("RF_True_Labels.csv", index=False)
        pred.to_csv("RF_Pred_Labels.csv", index=False)
        tr_time.to_csv("RF_Training_Time.csv", index=False)
        ts_time.to_csv("RF_Testing_Time.csv", index=False)
    else:
        truelab.to_csv("RF_" + str(NumGenes) + "_True_Labels.csv", index=False)
        pred.to_csv("RF_" + str(NumGenes) + "_Pred_Labels.csv", index=False)
        tr_time.to_csv("RF_" + str(NumGenes) + "_Training_Time.csv", index=False)
        ts_time.to_csv("RF_" + str(NumGenes) + "_Testing_Time.csv", index=False)
================================================
FILE: Scripts/run_SCINA.R
================================================
run_SCINA <- function(DataPath, LabelsPath, GeneSigPath, OutputDir){
  "
  run SCINA
  Wrapper script to run SCINA on a benchmark dataset,
  outputs lists of true and predicted cell labels as csv files, as well as computation time.

  Parameters
  ----------
  DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
  as row names and gene names as column names.
  LabelsPath : Cell population annotations file path (.csv).
  GeneSigPath : Cell type marker genes file path (.csv)
  OutputDir : Output directory defining the path of the exported file.
  "
  Data <- read.csv(DataPath, row.names = 1)
  Labels <- as.vector(as.matrix(read.csv(LabelsPath)))

  # SCINA is evaluated only on the three populations covered by the
  # signature file
  keep <- is.element(Labels, c('CD14+ Monocyte', 'CD19+ B', 'CD56+ NK'))
  Data <- Data[keep, ]
  Labels <- Labels[keep]
  # rename to the identifiers used in the signature file (no '+' or spaces)
  Labels[Labels == 'CD14+ Monocyte'] <- 'CD14_Monocyte'
  Labels[Labels == 'CD19+ B'] <- 'CD19_B'
  Labels[Labels == 'CD56+ NK'] <- 'CD56_NK'

  #############################################################################
  #                                  SCINA                                    #
  #############################################################################
  library(SCINA)
  library(preprocessCore)

  Signature_Genes <- preprocess.signatures(GeneSigPath)

  # SCINA expects genes x cells; log-transform, then quantile-normalize.
  # `Data[] <-` assigns in place so the dimnames are preserved.
  Data <- t(as.matrix(Data))
  Data <- log(Data + 1)
  Data[] <- normalize.quantiles(Data)

  start_time <- Sys.time()
  results <- SCINA(Data, Signature_Genes)
  end_time <- Sys.time()

  True_Labels_SCINA <- Labels
  Pred_Labels_SCINA <- results$cell_labels
  Total_Time_SCINA <- as.numeric(difftime(end_time, start_time, units = 'secs'))

  setwd(OutputDir)
  write.csv(True_Labels_SCINA, 'SCINA_True_Labels.csv', row.names = FALSE)
  write.csv(Pred_Labels_SCINA, 'SCINA_Pred_Labels.csv', row.names = FALSE)
  write.csv(Total_Time_SCINA, 'SCINA_Total_Time.csv', row.names = FALSE)
}
================================================
FILE: Scripts/run_SVM.py
================================================
import os
import numpy as np
import pandas as pd
import time as tm
from sklearn.svm import LinearSVC
import rpy2.robjects as robjects
def run_SVM(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
    '''
    run baseline classifier: SVM

    Wrapper script to run an SVM classifier with a linear kernel on a benchmark
    dataset with 5-fold cross validation, outputs lists of true and predicted
    cell labels as csv files, as well as computation time.

    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique
        barcodes as row names and gene names as column names.
    LabelsPath : Cell population annotations file path (.csv).
    CV_RDataPath : Cross validation RData file path (.RData), obtained from
        Cross_Validation.R function.
    OutputDir : Output directory defining the path of the exported files.
    GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
        defining the genes order for each cross validation fold, default is "".
    NumGenes : Number of genes used in case of feature selection (integer),
        default is 0 (no feature selection).
    '''
    # read the RData file written by Cross_Validation.R; it defines
    # n_folds, Cells_to_Keep, col_Index, Test_Idx and Train_Idx in R's globals
    robjects.r['load'](CV_RDataPath)
    nfolds = np.array(robjects.r['n_folds'], dtype='int')
    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype='bool')
    col = np.array(robjects.r['col_Index'], dtype='int')
    col = col - 1  # R indices are 1-based; pandas usecols is 0-based
    test_ind = np.array(robjects.r['Test_Idx'])
    train_ind = np.array(robjects.r['Train_Idx'])

    # read the data and drop cells filtered out by the CV setup
    data = pd.read_csv(DataPath, index_col=0, sep=',')
    labels = pd.read_csv(LabelsPath, header=0, index_col=None, sep=',', usecols=col)
    labels = labels.iloc[tokeep]
    data = data.iloc[tokeep]

    # read the feature file: one column of ranked gene indices per fold
    if NumGenes > 0:
        features = pd.read_csv(GeneOrderPath, header=0, index_col=None, sep=',')

    # results are written into the output folder
    os.chdir(OutputDir)

    # normalize data
    data = np.log1p(data)

    Classifier = LinearSVC()

    tr_time = []
    ts_time = []
    truelab = []
    pred = []

    for i in range(np.squeeze(nfolds)):
        # fold indices come from R and are 1-based; shift to 0-based
        test_ind_i = np.array(test_ind[i], dtype='int') - 1
        train_ind_i = np.array(train_ind[i], dtype='int') - 1
        train = data.iloc[train_ind_i]
        test = data.iloc[test_ind_i]
        y_train = labels.iloc[train_ind_i]
        y_test = labels.iloc[test_ind_i]

        if NumGenes > 0:
            # restrict to the top-ranked genes of this fold (positional indices)
            feat_to_use = features.iloc[0:NumGenes, i]
            train = train.iloc[:, feat_to_use]
            test = test.iloc[:, feat_to_use]

        start = tm.time()
        # ravel the single-column label frame to 1-d; passing the DataFrame
        # directly triggers sklearn's DataConversionWarning
        Classifier.fit(train, np.ravel(y_train))
        tr_time.append(tm.time() - start)

        start = tm.time()
        predicted = Classifier.predict(test)
        ts_time.append(tm.time() - start)

        truelab.extend(y_test.values)
        pred.extend(predicted)

    truelab = pd.DataFrame(truelab)
    pred = pd.DataFrame(pred)
    tr_time = pd.DataFrame(tr_time)
    ts_time = pd.DataFrame(ts_time)

    if NumGenes == 0:
        truelab.to_csv("SVM_True_Labels.csv", index=False)
        pred.to_csv("SVM_Pred_Labels.csv", index=False)
        tr_time.to_csv("SVM_Training_Time.csv", index=False)
        ts_time.to_csv("SVM_Testing_Time.csv", index=False)
    else:
        truelab.to_csv("SVM_" + str(NumGenes) + "_True_Labels.csv", index=False)
        pred.to_csv("SVM_" + str(NumGenes) + "_Pred_Labels.csv", index=False)
        tr_time.to_csv("SVM_" + str(NumGenes) + "_Training_Time.csv", index=False)
        ts_time.to_csv("SVM_" + str(NumGenes) + "_Testing_Time.csv", index=False)
================================================
FILE: Scripts/run_SVM_rejection.py
================================================
import os
import numpy as np
import pandas as pd
import time as tm
from sklearn.svm import LinearSVC
import rpy2.robjects as robjects
from sklearn.calibration import CalibratedClassifierCV
def run_SVM(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0, Threshold = 0.7):
    '''
    run baseline classifier: SVM with rejection

    Wrapper script to run an SVM classifier with a linear kernel on a benchmark
    dataset with 5-fold cross validation. Cells whose calibrated maximum class
    probability falls below Threshold are labeled 'Unknown'. Outputs lists of
    true and predicted cell labels as csv files, as well as computation time.

    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique
        barcodes as row names and gene names as column names.
    LabelsPath : Cell population annotations file path (.csv).
    CV_RDataPath : Cross validation RData file path (.RData), obtained from
        Cross_Validation.R function.
    OutputDir : Output directory defining the path of the exported files.
    GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
        defining the genes order for each cross validation fold, default is "".
    NumGenes : Number of genes used in case of feature selection (integer),
        default is 0 (no feature selection).
    Threshold : Threshold used when rejecting the cells, default is 0.7.
    '''
    # read the RData file written by Cross_Validation.R; it defines
    # n_folds, Cells_to_Keep, col_Index, Test_Idx and Train_Idx in R's globals
    robjects.r['load'](CV_RDataPath)
    nfolds = np.array(robjects.r['n_folds'], dtype='int')
    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype='bool')
    col = np.array(robjects.r['col_Index'], dtype='int')
    col = col - 1  # R indices are 1-based; pandas usecols is 0-based
    test_ind = np.array(robjects.r['Test_Idx'])
    train_ind = np.array(robjects.r['Train_Idx'])

    # read the data and drop cells filtered out by the CV setup
    data = pd.read_csv(DataPath, index_col=0, sep=',')
    labels = pd.read_csv(LabelsPath, header=0, index_col=None, sep=',', usecols=col)
    labels = labels.iloc[tokeep]
    data = data.iloc[tokeep]

    # read the feature file: one column of ranked gene indices per fold
    if NumGenes > 0:
        features = pd.read_csv(GeneOrderPath, header=0, index_col=None, sep=',')

    # results are written into the output folder
    os.chdir(OutputDir)

    # normalize data
    data = np.log1p(data)

    # LinearSVC has no predict_proba; wrap it in a probability calibrator
    Classifier = LinearSVC()
    clf = CalibratedClassifierCV(Classifier)

    tr_time = []
    ts_time = []
    truelab = []
    pred = []

    for i in range(np.squeeze(nfolds)):
        # fold indices come from R and are 1-based; shift to 0-based
        test_ind_i = np.array(test_ind[i], dtype='int') - 1
        train_ind_i = np.array(train_ind[i], dtype='int') - 1
        train = data.iloc[train_ind_i]
        test = data.iloc[test_ind_i]
        y_train = labels.iloc[train_ind_i]
        y_test = labels.iloc[test_ind_i]

        if NumGenes > 0:
            # restrict to the top-ranked genes of this fold (positional indices)
            feat_to_use = features.iloc[0:NumGenes, i]
            train = train.iloc[:, feat_to_use]
            test = test.iloc[:, feat_to_use]

        start = tm.time()
        # ravel the single-column label frame to 1-d; passing the DataFrame
        # directly triggers sklearn's DataConversionWarning
        clf.fit(train, np.ravel(y_train))
        tr_time.append(tm.time() - start)

        start = tm.time()
        predicted = clf.predict(test)
        prob = np.max(clf.predict_proba(test), axis=1)
        unlabeled = np.where(prob < Threshold)
        # cast to object dtype: predict() returns a fixed-width unicode array
        # sized to the training labels, so assigning 'Unknown' into it could
        # silently truncate (e.g. dtype '<U5' would store 'Unkno')
        predicted = predicted.astype(object)
        predicted[unlabeled] = 'Unknown'
        ts_time.append(tm.time() - start)

        truelab.extend(y_test.values)
        pred.extend(predicted)

    truelab = pd.DataFrame(truelab)
    pred = pd.DataFrame(pred)
    tr_time = pd.DataFrame(tr_time)
    ts_time = pd.DataFrame(ts_time)

    # NOTE(review): file names intentionally match the non-rejecting SVM
    # script, so the two must be run in different output folders
    if NumGenes == 0:
        truelab.to_csv("SVM_True_Labels.csv", index=False)
        pred.to_csv("SVM_Pred_Labels.csv", index=False)
        tr_time.to_csv("SVM_Training_Time.csv", index=False)
        ts_time.to_csv("SVM_Testing_Time.csv", index=False)
    else:
        truelab.to_csv("SVM_" + str(NumGenes) + "_True_Labels.csv", index=False)
        pred.to_csv("SVM_" + str(NumGenes) + "_Pred_Labels.csv", index=False)
        tr_time.to_csv("SVM_" + str(NumGenes) + "_Training_Time.csv", index=False)
        ts_time.to_csv("SVM_" + str(NumGenes) + "_Testing_Time.csv", index=False)
================================================
FILE: Scripts/run_SingleR.R
================================================
run_SingleR <- function(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = NULL, NumGenes = NULL){
  "
  run SingleR
  Wrapper script to run SingleR on a benchmark dataset with 5-fold cross validation,
  outputs lists of true and predicted cell labels as csv files, as well as computation time.

  Parameters
  ----------
  DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
  as row names and gene names as column names.
  LabelsPath : Cell population annotations file path (.csv).
  CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
  OutputDir : Output directory defining the path of the exported file.
  GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
  defining the genes order for each cross validation fold, default is NULL.
  NumGenes : Number of genes used in case of feature selection (integer), default is NULL.
  "
  Data <- read.csv(DataPath, row.names = 1)
  Labels <- as.matrix(read.csv(LabelsPath))
  # provides n_folds, Cells_to_Keep, col_Index, Train_Idx and Test_Idx
  load(CV_RDataPath)
  Labels <- as.vector(Labels[, col_Index])
  Data <- Data[Cells_to_Keep, ]
  Labels <- Labels[Cells_to_Keep]

  # scalar condition: use short-circuiting `&&`, not elementwise `&`
  use_features <- !is.null(GeneOrderPath) && !is.null(NumGenes)
  if (use_features) {
    GenesOrder <- read.csv(GeneOrderPath)
  }

  #############################################################################
  #                                 SingleR                                   #
  #############################################################################
  library(SingleR)
  library(Seurat)

  True_Labels_SingleR <- list()
  Pred_Labels_SingleR <- list()
  Total_Time_SingleR <- list()
  Data <- t(as.matrix(Data))  # SingleR expects a genes x cells matrix

  for (i in seq_len(n_folds)) {
    if (use_features) {
      # GenesOrder stores 0-based gene indices; +1 converts to R's 1-based
      genes_i <- as.vector(GenesOrder[seq_len(NumGenes), i]) + 1
      start_time <- Sys.time()
      singler <- SingleR(method = "single", Data[genes_i, Test_Idx[[i]]],
                         Data[genes_i, Train_Idx[[i]]],
                         Labels[Train_Idx[[i]]], numCores = 1)
      end_time <- Sys.time()
    } else {
      start_time <- Sys.time()
      singler <- SingleR(method = "single", Data[, Test_Idx[[i]]], Data[, Train_Idx[[i]]],
                         Labels[Train_Idx[[i]]], numCores = 1)
      end_time <- Sys.time()
    }
    Total_Time_SingleR[i] <- as.numeric(difftime(end_time, start_time, units = 'secs'))
    True_Labels_SingleR[i] <- list(Labels[Test_Idx[[i]]])
    Pred_Labels_SingleR[i] <- list(as.vector(singler$labels))
  }

  # flatten the per-fold lists before writing
  True_Labels_SingleR <- as.vector(unlist(True_Labels_SingleR))
  Pred_Labels_SingleR <- as.vector(unlist(Pred_Labels_SingleR))
  Total_Time_SingleR <- as.vector(unlist(Total_Time_SingleR))

  setwd(OutputDir)
  if (use_features) {
    write.csv(True_Labels_SingleR, paste('SingleR_', NumGenes, '_True_Labels.csv', sep = ''), row.names = FALSE)
    write.csv(Pred_Labels_SingleR, paste('SingleR_', NumGenes, '_Pred_Labels.csv', sep = ''), row.names = FALSE)
    write.csv(Total_Time_SingleR, paste('SingleR_', NumGenes, '_Total_Time.csv', sep = ''), row.names = FALSE)
  } else {
    write.csv(True_Labels_SingleR, 'SingleR_True_Labels.csv', row.names = FALSE)
    write.csv(Pred_Labels_SingleR, 'SingleR_Pred_Labels.csv', row.names = FALSE)
    write.csv(Total_Time_SingleR, 'SingleR_Total_Time.csv', row.names = FALSE)
  }
}
================================================
FILE: Scripts/run_kNN50.py
================================================
import os
import numpy as np
import pandas as pd
import time as tm
from sklearn.neighbors import KNeighborsClassifier
import rpy2.robjects as robjects
def run_kNN50(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
    '''
    run baseline classifier: kNN (k = 50)

    Wrapper script to run a kNN classifier (with k = 50) on a benchmark dataset
    with 5-fold cross validation, outputs lists of true and predicted cell
    labels as csv files, as well as computation time.

    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique
        barcodes as row names and gene names as column names.
    LabelsPath : Cell population annotations file path (.csv).
    CV_RDataPath : Cross validation RData file path (.RData), obtained from
        Cross_Validation.R function.
    OutputDir : Output directory defining the path of the exported files.
    GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
        defining the genes order for each cross validation fold, default is "".
    NumGenes : Number of genes used in case of feature selection (integer),
        default is 0 (no feature selection).
    '''
    # read the RData file written by Cross_Validation.R; it defines
    # n_folds, Cells_to_Keep, col_Index, Test_Idx and Train_Idx in R's globals
    robjects.r['load'](CV_RDataPath)
    nfolds = np.array(robjects.r['n_folds'], dtype='int')
    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype='bool')
    col = np.array(robjects.r['col_Index'], dtype='int')
    col = col - 1  # R indices are 1-based; pandas usecols is 0-based
    test_ind = np.array(robjects.r['Test_Idx'])
    train_ind = np.array(robjects.r['Train_Idx'])

    # read the data and drop cells filtered out by the CV setup
    data = pd.read_csv(DataPath, index_col=0, sep=',')
    labels = pd.read_csv(LabelsPath, header=0, index_col=None, sep=',', usecols=col)
    labels = labels.iloc[tokeep]
    data = data.iloc[tokeep]

    # read the feature file: one column of ranked gene indices per fold
    if NumGenes > 0:
        features = pd.read_csv(GeneOrderPath, header=0, index_col=None, sep=',')

    # results are written into the output folder
    os.chdir(OutputDir)

    # normalize data
    data = np.log1p(data)

    Classifier = KNeighborsClassifier(n_neighbors=50)

    tr_time = []
    ts_time = []
    truelab = []
    pred = []

    for i in range(np.squeeze(nfolds)):
        # fold indices come from R and are 1-based; shift to 0-based
        test_ind_i = np.array(test_ind[i], dtype='int') - 1
        train_ind_i = np.array(train_ind[i], dtype='int') - 1
        train = data.iloc[train_ind_i]
        test = data.iloc[test_ind_i]
        y_train = labels.iloc[train_ind_i]
        y_test = labels.iloc[test_ind_i]

        if NumGenes > 0:
            # restrict to the top-ranked genes of this fold (positional indices)
            feat_to_use = features.iloc[0:NumGenes, i]
            train = train.iloc[:, feat_to_use]
            test = test.iloc[:, feat_to_use]

        start = tm.time()
        # ravel the single-column label frame to 1-d; passing the DataFrame
        # directly triggers sklearn's DataConversionWarning
        Classifier.fit(train, np.ravel(y_train))
        tr_time.append(tm.time() - start)

        start = tm.time()
        predicted = Classifier.predict(test)
        ts_time.append(tm.time() - start)

        truelab.extend(y_test.values)
        pred.extend(predicted)

    truelab = pd.DataFrame(truelab)
    pred = pd.DataFrame(pred)
    tr_time = pd.DataFrame(tr_time)
    ts_time = pd.DataFrame(ts_time)

    if NumGenes == 0:
        truelab.to_csv("kNN50_True_Labels.csv", index=False)
        pred.to_csv("kNN50_Pred_Labels.csv", index=False)
        tr_time.to_csv("kNN50_Training_Time.csv", index=False)
        ts_time.to_csv("kNN50_Testing_Time.csv", index=False)
    else:
        truelab.to_csv("kNN50_" + str(NumGenes) + "_True_Labels.csv", index=False)
        pred.to_csv("kNN50_" + str(NumGenes) + "_Pred_Labels.csv", index=False)
        tr_time.to_csv("kNN50_" + str(NumGenes) + "_Training_Time.csv", index=False)
        ts_time.to_csv("kNN50_" + str(NumGenes) + "_Testing_Time.csv", index=False)
================================================
FILE: Scripts/run_kNN9.py
================================================
import os
import numpy as np
import pandas as pd
import time as tm
from sklearn.neighbors import KNeighborsClassifier
import rpy2.robjects as robjects
def run_kNN9(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
    '''
    run baseline classifier: kNN (k = 9)

    Wrapper script to run a kNN classifier (with k = 9) on a benchmark dataset
    with 5-fold cross validation, outputs lists of true and predicted cell
    labels as csv files, as well as computation time.

    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique
        barcodes as row names and gene names as column names.
    LabelsPath : Cell population annotations file path (.csv).
    CV_RDataPath : Cross validation RData file path (.RData), obtained from
        Cross_Validation.R function.
    OutputDir : Output directory defining the path of the exported files.
    GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
        defining the genes order for each cross validation fold, default is "".
    NumGenes : Number of genes used in case of feature selection (integer),
        default is 0 (no feature selection).
    '''
    # read the RData file written by Cross_Validation.R; it defines
    # n_folds, Cells_to_Keep, col_Index, Test_Idx and Train_Idx in R's globals
    robjects.r['load'](CV_RDataPath)
    nfolds = np.array(robjects.r['n_folds'], dtype='int')
    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype='bool')
    col = np.array(robjects.r['col_Index'], dtype='int')
    col = col - 1  # R indices are 1-based; pandas usecols is 0-based
    test_ind = np.array(robjects.r['Test_Idx'])
    train_ind = np.array(robjects.r['Train_Idx'])

    # read the data and drop cells filtered out by the CV setup
    data = pd.read_csv(DataPath, index_col=0, sep=',')
    labels = pd.read_csv(LabelsPath, header=0, index_col=None, sep=',', usecols=col)
    labels = labels.iloc[tokeep]
    data = data.iloc[tokeep]

    # read the feature file: one column of ranked gene indices per fold
    if NumGenes > 0:
        features = pd.read_csv(GeneOrderPath, header=0, index_col=None, sep=',')

    # results are written into the output folder
    os.chdir(OutputDir)

    # normalize data
    data = np.log1p(data)

    Classifier = KNeighborsClassifier(n_neighbors=9)

    tr_time = []
    ts_time = []
    truelab = []
    pred = []

    for i in range(np.squeeze(nfolds)):
        # fold indices come from R and are 1-based; shift to 0-based
        test_ind_i = np.array(test_ind[i], dtype='int') - 1
        train_ind_i = np.array(train_ind[i], dtype='int') - 1
        train = data.iloc[train_ind_i]
        test = data.iloc[test_ind_i]
        y_train = labels.iloc[train_ind_i]
        y_test = labels.iloc[test_ind_i]

        if NumGenes > 0:
            # restrict to the top-ranked genes of this fold (positional indices)
            feat_to_use = features.iloc[0:NumGenes, i]
            train = train.iloc[:, feat_to_use]
            test = test.iloc[:, feat_to_use]

        start = tm.time()
        # ravel the single-column label frame to 1-d; passing the DataFrame
        # directly triggers sklearn's DataConversionWarning
        Classifier.fit(train, np.ravel(y_train))
        tr_time.append(tm.time() - start)

        start = tm.time()
        predicted = Classifier.predict(test)
        ts_time.append(tm.time() - start)

        truelab.extend(y_test.values)
        pred.extend(predicted)

    truelab = pd.DataFrame(truelab)
    pred = pd.DataFrame(pred)
    tr_time = pd.DataFrame(tr_time)
    ts_time = pd.DataFrame(ts_time)

    if NumGenes == 0:
        truelab.to_csv("kNN9_True_Labels.csv", index=False)
        pred.to_csv("kNN9_Pred_Labels.csv", index=False)
        tr_time.to_csv("kNN9_Training_Time.csv", index=False)
        ts_time.to_csv("kNN9_Testing_Time.csv", index=False)
    else:
        truelab.to_csv("kNN9_" + str(NumGenes) + "_True_Labels.csv", index=False)
        pred.to_csv("kNN9_" + str(NumGenes) + "_Pred_Labels.csv", index=False)
        tr_time.to_csv("kNN9_" + str(NumGenes) + "_Training_Time.csv", index=False)
        ts_time.to_csv("kNN9_" + str(NumGenes) + "_Testing_Time.csv", index=False)
================================================
FILE: Scripts/run_moana.py
================================================
import os
import pandas as pd
import numpy as np
from moana.core import ExpMatrix
from moana.classify import CellTypeClassifier
import time as tm
import rpy2.robjects as robjects
def run_moana(DataPath, LabelsPath, ClassifierPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
    '''
    run moana

    Wrapper script to run moana on a benchmark dataset with a pretrained
    classifier, outputs lists of true and predicted cell labels as csv files,
    as well as computation time.

    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique
        barcodes as row names and gene names as column names.
    LabelsPath : Cell population annotations file path (.csv).
    ClassifierPath : Data file path to the pretrained classifier.
    OutputDir : Output directory defining the path of the exported files.
    GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
        defining the genes order, default is "".
    NumGenes : Number of genes used in case of feature selection (integer),
        default is 0 (no feature selection).
    '''
    # expression matrix is read cells x genes (csv despite the read_tsv name,
    # thanks to sep=',')
    matrix = ExpMatrix.read_tsv(DataPath, sep=',')
    truelab = pd.read_csv(LabelsPath, header=0, index_col=None, sep=',')

    # populations known to the pretrained PBMC classifier, paired with the
    # names the classifier itself uses
    ct_old = ['CD19+ B','CD14+ Monocyte','CD4+/CD45RA+/CD25- Naive T','CD4+/CD45RO+ Memory','CD8+/CD45RA+ Naive Cytotoxic','Dendritic', 'CD56+ NK']
    ct_new = ['B cells','CD14+ monocytes','Naive CD4+ T cells','Memory CD4+ T cells','Naive CD8+ T cells','Dendritic cells','NK cells']

    # use a 1-d mask so unwanted label rows are actually DROPPED; indexing the
    # label frame with the original 2-d mask only NaN-filled those rows, which
    # left the true-label file longer than (and misaligned with) predictions
    tokeep2 = np.squeeze(np.isin(truelab, ct_old))
    truelab = truelab[tokeep2]
    print(len(truelab))
    matrix = matrix.iloc[tokeep2]

    # translate labels into the classifier's vocabulary; `replace` avoids the
    # `.iloc[boolean-frame]` assignment that modern pandas rejects
    truelab = truelab.replace(dict(zip(ct_old, ct_new)))

    # read the feature file and keep only the top-ranked genes
    if NumGenes > 0:
        features = pd.read_csv(GeneOrderPath, header=0, index_col=None, sep=',')
        feat_to_use = features.iloc[0:NumGenes, 0]
        matrix = matrix.iloc[:, feat_to_use]

    # moana expects a genes x cells matrix; swap axes and relabel
    data = ExpMatrix(X=np.transpose(matrix.X), genes=matrix.cells, cells=matrix.genes)
    data.genes.name = 'Genes'
    data.cells.name = 'Cells'
    data.index.name = 'Genes'
    data.columns.name = 'Cells'

    clf = CellTypeClassifier.read_pickle(ClassifierPath)

    start = tm.time()
    predictions = clf.predict(data)
    runtime = tm.time() - start

    pred = pd.DataFrame(predictions)

    os.chdir(OutputDir)
    if NumGenes == 0:
        truelab.to_csv("moana_True_Labels.csv", index=False)
        pred.to_csv("moana_Pred_Labels.csv", index=False)
        with open("moana_Total_Time.csv", 'w') as f:
            f.write("%f\n" % runtime)
    else:
        truelab.to_csv("moana_" + str(NumGenes) + "_True_Labels.csv", index=False)
        pred.to_csv("moana_" + str(NumGenes) + "_Pred_Labels.csv", index=False)
        with open("moana_" + str(NumGenes) + "_Total_Time.csv", 'w') as f:
            f.write("%f\n" % runtime)
================================================
FILE: Scripts/run_scID.R
================================================
run_scID <- function(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = NULL, NumGenes = NULL){
  "
  run scID
  Wrapper script to run scID on a benchmark dataset with 5-fold cross validation,
  outputs lists of true and predicted cell labels as csv files, as well as computation time.

  Parameters
  ----------
  DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
  as row names and gene names as column names.
  LabelsPath : Cell population annotations file path (.csv).
  CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
  OutputDir : Output directory defining the path of the exported file.
  GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
  defining the genes order for each cross validation fold, default is NULL.
  NumGenes : Number of genes used in case of feature selection (integer), default is NULL.
  "
  Data <- read.csv(DataPath, row.names = 1)
  Labels <- as.matrix(read.csv(LabelsPath))
  # provides n_folds, Cells_to_Keep, col_Index, Train_Idx and Test_Idx
  load(CV_RDataPath)
  Labels <- as.vector(Labels[, col_Index])
  Data <- Data[Cells_to_Keep, ]
  Labels <- Labels[Cells_to_Keep]

  # scalar condition: use short-circuiting `&&`, not elementwise `&`
  use_features <- !is.null(GeneOrderPath) && !is.null(NumGenes)
  if (use_features) {
    GenesOrder <- read.csv(GeneOrderPath)
  }

  #############################################################################
  #                                   scID                                    #
  #############################################################################
  library(scID)
  library(Seurat)

  True_Labels_scID <- list()
  Pred_Labels_scID <- list()
  Total_Time_scID <- list()
  Data <- t(as.matrix(Data))  # scID expects a genes x cells matrix

  for (i in seq_len(n_folds)) {
    # select the fold's train/test matrices once (subsetting is hoisted out of
    # the timed region so only the classifier itself is timed)
    if (use_features) {
      # GenesOrder stores 0-based gene indices; +1 converts to R's 1-based
      genes_i <- as.vector(GenesOrder[seq_len(NumGenes), i]) + 1
      train_mat <- Data[genes_i, Train_Idx[[i]]]
      test_mat <- Data[genes_i, Test_Idx[[i]]]
    } else {
      train_mat <- Data[, Train_Idx[[i]]]
      test_mat <- Data[, Test_Idx[[i]]]
    }
    # scid_multiclass requires a named label vector (names = cell barcodes)
    Train_Labels <- Labels[Train_Idx[[i]]]
    names(Train_Labels) <- colnames(train_mat)

    start_time <- Sys.time()
    scID_output <- scid_multiclass(test_mat, train_mat, Train_Labels)
    end_time <- Sys.time()

    Total_Time_scID[i] <- as.numeric(difftime(end_time, start_time, units = 'secs'))
    True_Labels_scID[i] <- list(Labels[Test_Idx[[i]]])
    Pred_Labels_scID[i] <- list(as.vector(scID_output$labels))
  }

  # flatten the per-fold lists before writing
  True_Labels_scID <- as.vector(unlist(True_Labels_scID))
  Pred_Labels_scID <- as.vector(unlist(Pred_Labels_scID))
  Total_Time_scID <- as.vector(unlist(Total_Time_scID))

  setwd(OutputDir)
  if (use_features) {
    write.csv(True_Labels_scID, paste('scID_', NumGenes, '_True_Labels.csv', sep = ''), row.names = FALSE)
    write.csv(Pred_Labels_scID, paste('scID_', NumGenes, '_Pred_Labels.csv', sep = ''), row.names = FALSE)
    write.csv(Total_Time_scID, paste('scID_', NumGenes, '_Total_Time.csv', sep = ''), row.names = FALSE)
  } else {
    write.csv(True_Labels_scID, 'scID_True_Labels.csv', row.names = FALSE)
    write.csv(Pred_Labels_scID, 'scID_Pred_Labels.csv', row.names = FALSE)
    write.csv(Total_Time_scID, 'scID_Total_Time.csv', row.names = FALSE)
  }
}
================================================
FILE: Scripts/run_scPred.R
================================================
run_scPred <- function(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = NULL, NumGenes = NULL){
  "
  run scPred
  Wrapper script to run scPred on a benchmark dataset with 5-fold cross validation,
  outputs lists of true and predicted cell labels as csv files, as well as computation time.

  Parameters
  ----------
  DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
  as row names and gene names as column names.
  LabelsPath : Cell population annotations file path (.csv).
  CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
  OutputDir : Output directory defining the path of the exported file.
  GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
  defining the genes order for each cross validation fold, default is NULL.
  NumGenes : Number of genes used in case of feature selection (integer), default is NULL.
  "
  Data <- read.csv(DataPath, row.names = 1)
  Labels <- as.matrix(read.csv(LabelsPath))
  # provides n_folds, Cells_to_Keep, col_Index, Train_Idx and Test_Idx
  load(CV_RDataPath)
  Labels <- as.vector(Labels[, col_Index])
  Data <- Data[Cells_to_Keep, ]
  Labels <- Labels[Cells_to_Keep]

  # scalar condition: use short-circuiting `&&`, not elementwise `&`
  use_features <- !is.null(GeneOrderPath) && !is.null(NumGenes)
  if (use_features) {
    GenesOrder <- read.csv(GeneOrderPath)
  }

  #############################################################################
  #                                  scPred                                   #
  #############################################################################
  library(scPred)
  library(tidyverse)
  library(SingleCellExperiment)

  # counts-per-million normalisation of a genes x cells matrix
  to_cpm <- function(m) apply(m, 2, function(x) (x / sum(x)) * 1000000)

  True_Labels_scPred <- list()
  Pred_Labels_scPred <- list()
  Training_Time_scPred <- list()
  Testing_Time_scPred <- list()
  Data <- t(as.matrix(Data))  # genes x cells

  for (i in seq_len(n_folds)) {
    # select the fold's train/test matrices once; the original duplicated the
    # whole SCE-building pipeline across both branches
    if (use_features) {
      # GenesOrder stores 0-based gene indices; +1 converts to R's 1-based
      genes_i <- as.vector(GenesOrder[seq_len(NumGenes), i]) + 1
      train_mat <- Data[genes_i, Train_Idx[[i]]]
      test_mat <- Data[genes_i, Test_Idx[[i]]]
    } else {
      train_mat <- Data[, Train_Idx[[i]]]
      test_mat <- Data[, Test_Idx[[i]]]
    }
    sce <- SingleCellExperiment(list(normcounts = train_mat),
                                colData = data.frame(cell_type1 = Labels[Train_Idx[[i]]]))
    sce_cpm <- to_cpm(normcounts(sce))
    sce_metadata <- as.data.frame(colData(sce))
    sce_test <- SingleCellExperiment(list(normcounts = test_mat),
                                     colData = data.frame(cell_type1 = Labels[Test_Idx[[i]]]))
    sce_cpm_test <- to_cpm(normcounts(sce_test))

    # scPred training
    start_time <- Sys.time()
    set.seed(1234)  # eigenDecompose/trainModel are stochastic; fix the seed
    scp <- eigenDecompose(sce_cpm)
    scPred::metadata(scp) <- sce_metadata
    scp <- getFeatureSpace(scp, pVar = 'cell_type1')
    scp <- trainModel(scp)
    end_time <- Sys.time()
    Training_Time_scPred[i] <- as.numeric(difftime(end_time, start_time, units = 'secs'))

    # scPred prediction
    start_time <- Sys.time()
    scp <- scPredict(scp, newData = sce_cpm_test)
    end_time <- Sys.time()
    Testing_Time_scPred[i] <- as.numeric(difftime(end_time, start_time, units = 'secs'))

    True_Labels_scPred[i] <- list(Labels[Test_Idx[[i]]])
    Pred_Labels_scPred[i] <- list(getPredictions(scp)$predClass)
  }

  # flatten the per-fold lists before writing
  True_Labels_scPred <- as.vector(unlist(True_Labels_scPred))
  Pred_Labels_scPred <- as.vector(unlist(Pred_Labels_scPred))
  Training_Time_scPred <- as.vector(unlist(Training_Time_scPred))
  Testing_Time_scPred <- as.vector(unlist(Testing_Time_scPred))

  setwd(OutputDir)
  if (use_features) {
    write.csv(True_Labels_scPred, paste('scPred_', NumGenes, '_True_Labels.csv', sep = ''), row.names = FALSE)
    write.csv(Pred_Labels_scPred, paste('scPred_', NumGenes, '_Pred_Labels.csv', sep = ''), row.names = FALSE)
    write.csv(Training_Time_scPred, paste('scPred_', NumGenes, '_Training_Time.csv', sep = ''), row.names = FALSE)
    write.csv(Testing_Time_scPred, paste('scPred_', NumGenes, '_Testing_Time.csv', sep = ''), row.names = FALSE)
  } else {
    write.csv(True_Labels_scPred, 'scPred_True_Labels.csv', row.names = FALSE)
    write.csv(Pred_Labels_scPred, 'scPred_Pred_Labels.csv', row.names = FALSE)
    write.csv(Training_Time_scPred, 'scPred_Training_Time.csv', row.names = FALSE)
    write.csv(Testing_Time_scPred, 'scPred_Testing_Time.csv', row.names = FALSE)
  }
}
================================================
FILE: Scripts/run_scVI.py
================================================
from scvi.dataset import CsvDataset
import os
import numpy as np
import pandas as pd
from scvi.models import SCANVI
from scvi.inference import SemiSupervisedTrainer
import time as tm
import rpy2.robjects as robjects
def run_scVI(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
    '''
    run scVI
    Wrapper script to run scVI on a benchmark dataset with 5-fold cross validation,
    outputs lists of true and predicted cell labels as csv files, as well as computation time.
    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
    as row names and gene names as column names.
    LabelsPath : Cell population annotations file path (.csv).
    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
    OutputDir : Output directory defining the path of the exported file.
    GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
    defining the genes order for each cross validation fold, default is NULL.
    NumGenes : Number of genes used in case of feature selection (integer), default is 0.
    '''
    # read the Rdata file: loads n_folds, Cells_to_Keep, col_Index, Test_Idx,
    # Train_Idx into the embedded R session, then pull them across via rpy2
    robjects.r['load'](CV_RDataPath)
    nfolds = np.array(robjects.r['n_folds'], dtype = 'int')
    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
    col = np.array(robjects.r['col_Index'], dtype = 'int')
    # R indices are 1-based; shift to a 0-based column index for pandas
    col = col - 1
    test_ind = np.array(robjects.r['Test_Idx'])
    train_ind = np.array(robjects.r['Train_Idx'])
    # read the data
    data = pd.read_csv(DataPath,index_col=0,sep=',')
    labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col)
    # drop cells whose population was filtered out during cross-validation setup
    labels = labels.iloc[tokeep]
    data = data.iloc[tokeep]
    # read the feature file (per-fold ranked gene order from feature selection)
    if (NumGenes > 0):
        features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')
    os.chdir(OutputDir)
    if (NumGenes == 0):
        #save labels as csv file with header and index column
        labels.to_csv('Labels_scvi.csv')
        data.to_csv('Data_scvi.csv')
        train = CsvDataset('Data_scvi.csv', save_path = "", sep = ",", labels_file = "Labels_scvi.csv", gene_by_cell = False)
        ## this semisupervised trainer automatically uses a part of the input data for training and a part for testing
        scanvi = SCANVI(train.nb_genes, train.n_batches, train.n_labels)
        trainer_scanvi = SemiSupervisedTrainer(scanvi, train, frequency=5)
    n_epochs = 200
    truelab = []
    pred = []
    tr_time = []
    ts_time = []
    for i in range(np.squeeze(nfolds)):
        # per-fold train/test indices, converted from 1-based R to 0-based numpy
        test_ind_i = np.array(test_ind[i], dtype = 'int') - 1
        train_ind_i = np.array(train_ind[i], dtype = 'int') - 1
        if (NumGenes > 0):
            # rebuild dataset and a fresh SCANVI model per fold, restricted to
            # that fold's top NumGenes features
            feat_to_use = features.iloc[0:NumGenes,i]
            data2 = data.iloc[:,feat_to_use]
            labels.to_csv('Labels_scvi.csv')
            data2.to_csv('Data_scvi.csv')
            train = CsvDataset('Data_scvi.csv', save_path = "", sep = ",", labels_file = "Labels_scvi.csv", gene_by_cell = False, new_n_genes = False)
            ## this semisupervised trainer automatically uses a part of the input data for training and a part for testing
            scanvi = SCANVI(train.nb_genes, train.n_batches, train.n_labels)
            trainer_scanvi = SemiSupervisedTrainer(scanvi, train, frequency=5)
        # NOTE(review): when NumGenes == 0 the single model/trainer built before
        # the loop is reused for every fold, so weights carry over between
        # folds — confirm this is intended.
        trainer_scanvi.labelled_set = trainer_scanvi.create_posterior(indices=(train_ind_i).ravel(), shuffle = False)
        trainer_scanvi.labelled_set.to_monitor = ['ll','accuracy']
        trainer_scanvi.unlabelled_set = trainer_scanvi.create_posterior(indices=(test_ind_i).ravel(), shuffle = False)
        trainer_scanvi.unlabelled_set.to_monitor = ['ll','accuracy']
        start = tm.time()
        trainer_scanvi.train(n_epochs)
        tr_time.append(tm.time()-start)
        ## labels of test set are in y_pred
        ## labels are returned in numbers, should be mapped back to the real labels
        ## indices are permutated
        start = tm.time()
        y_true, y_pred = trainer_scanvi.unlabelled_set.compute_predictions()
        ts_time.append(tm.time()-start)
        truelab.extend(y_true)
        pred.extend(y_pred)
    #write results
    truelab = pd.DataFrame(truelab)
    pred = pd.DataFrame(pred)
    tr_time = pd.DataFrame(tr_time)
    ts_time = pd.DataFrame(ts_time)
    if (NumGenes == 0):
        truelab.to_csv("scVI_True_Labels.csv", index = False)
        pred.to_csv("scVI_Pred_Labels.csv", index = False)
        tr_time.to_csv("scVI_Training_Time.csv", index = False)
        ts_time.to_csv("scVI_Testing_Time.csv", index = False)
    else:
        truelab.to_csv("scVI_" + str(NumGenes) + "_True_Labels.csv", index = False)
        pred.to_csv("scVI_" + str(NumGenes) + "_Pred_Labels.csv", index = False)
        tr_time.to_csv("scVI_" + str(NumGenes) + "_Training_Time.csv", index = False)
        ts_time.to_csv("scVI_" + str(NumGenes) + "_Testing_Time.csv", index = False)
================================================
FILE: Scripts/run_scmap.R
================================================
run_scmap <- function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){
  "
  run scmap
  Wrapper script to run scmap on a benchmark dataset with 5-fold cross validation,
  outputs lists of true and predicted cell labels as csv files, as well as computation time.
  Parameters
  ----------
  DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
  as row names and gene names as column names.
  LabelsPath : Cell population annotations file path (.csv).
  CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
  OutputDir : Output directory defining the path of the exported file.
  GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
  defining the genes order for each cross validation fold, default is NULL.
  NumGenes : Number of genes used in case of feature selection (integer), default is NULL.
  "
  Data <- read.csv(DataPath,row.names = 1)
  Labels <- as.matrix(read.csv(LabelsPath))
  # Loads n_folds, Train_Idx, Test_Idx, col_Index, Cells_to_Keep
  load(CV_RDataPath)
  Labels <- as.vector(Labels[,col_Index])
  # Drop cells whose populations were filtered out during CV setup
  Data <- Data[Cells_to_Keep,]
  Labels <- Labels[Cells_to_Keep]
  # `&&` (scalar, short-circuit) is the correct operator for an `if` condition;
  # the elementwise `&` happens to work here but is an idiom bug.
  if(!is.null(GeneOrderPath) && !is.null(NumGenes)){
    GenesOrder <- read.csv(GeneOrderPath)
  }
  #############################################################################
  #                                  scmap                                    #
  #############################################################################
  library(scmap)
  library(SingleCellExperiment)
  True_Labels_scmapcluster <- list()
  Pred_Labels_scmapcluster <- list()
  True_Labels_scmapcell <- list()
  Pred_Labels_scmapcell <- list()
  Training_Time_scmapcluster <- list()
  Testing_Time_scmapcluster <- list()
  Training_Time_scmapcell <- list()
  Testing_Time_scmapcell <- list()
  # scmap expects genes x cells
  Data <- t(as.matrix(Data))
  for (i in c(1:n_folds)){
    if(!is.null(GeneOrderPath) && !is.null(NumGenes)){
      # GenesOrder holds 0-based gene indices, hence the +1
      sce <- SingleCellExperiment(list(normcounts = Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Train_Idx[[i]]]),
                                  colData = data.frame(cell_type1 = Labels[Train_Idx[[i]]]))
      logcounts(sce) <- log2(normcounts(sce) + 1)
      # use gene names as feature symbols
      rowData(sce)$feature_symbol <- rownames(sce)
      sce <- selectFeatures(sce, n_features = NumGenes, suppress_plot = TRUE)
      sce_test <- SingleCellExperiment(list(normcounts = Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Test_Idx[[i]]]),
                                       colData = data.frame(cell_type1 = Labels[Test_Idx[[i]]]))
      logcounts(sce_test) <- log2(normcounts(sce_test) + 1)
      rowData(sce_test)$feature_symbol <- rownames(sce_test)
      # copy the selected-feature flags from the train object onto the test object
      sce_test@rowRanges@elementMetadata@listData <- sce@rowRanges@elementMetadata@listData
    }
    else{
      sce <- SingleCellExperiment(list(normcounts = Data[,Train_Idx[[i]]]),
                                  colData = data.frame(cell_type1 = Labels[Train_Idx[[i]]]))
      logcounts(sce) <- log2(normcounts(sce) + 1)
      # use gene names as feature symbols
      rowData(sce)$feature_symbol <- rownames(sce)
      sce <- selectFeatures(sce, suppress_plot = TRUE)
      sce_test <- SingleCellExperiment(list(normcounts = Data[,Test_Idx[[i]]]),
                                       colData = data.frame(cell_type1 = Labels[Test_Idx[[i]]]))
      logcounts(sce_test) <- log2(normcounts(sce_test) + 1)
      rowData(sce_test)$feature_symbol <- rownames(sce_test)
      # copy the selected-feature flags from the train object onto the test object
      sce_test@rowRanges@elementMetadata@listData <- sce@rowRanges@elementMetadata@listData
    }
    # scmap-cluster: index on cluster centroids, then project the test cells
    start_time <- Sys.time()
    sce <- indexCluster(sce)
    end_time <- Sys.time()
    Training_Time_scmapcluster[i] <- as.numeric(difftime(end_time,start_time,units = 'secs'))
    start_time <- Sys.time()
    scmapCluster_results <- scmapCluster(projection = sce_test,index_list = list(metadata(sce)$scmap_cluster_index))
    end_time <- Sys.time()
    Testing_Time_scmapcluster[i] <- as.numeric(difftime(end_time,start_time,units = 'secs'))
    True_Labels_scmapcluster[i] <- list(Labels[Test_Idx[[i]]])
    Pred_Labels_scmapcluster[i] <- list(scmapCluster_results$combined_labs)
    # scmap-cell: per-cell kNN index; seeded because product quantization is stochastic
    start_time <- Sys.time()
    set.seed(1)
    sce <- indexCell(sce)
    end_time <- Sys.time()
    Training_Time_scmapcell[i] <- as.numeric(difftime(end_time,start_time,units = 'secs'))
    start_time <- Sys.time()
    scmapCell_results <- scmapCell(sce_test,list(metadata(sce)$scmap_cell_index))
    scmapCell_clusters <- scmapCell2Cluster(scmapCell_results,list(as.character(colData(sce)$cell_type1)))
    end_time <- Sys.time()
    Testing_Time_scmapcell[i] <- as.numeric(difftime(end_time,start_time,units = 'secs'))
    True_Labels_scmapcell[i] <- list(Labels[Test_Idx[[i]]])
    Pred_Labels_scmapcell[i] <- list(scmapCell_clusters$combined_labs)
  }
  # Flatten the per-fold lists into plain vectors before export
  True_Labels_scmapcluster <- as.vector(unlist(True_Labels_scmapcluster))
  Pred_Labels_scmapcluster <- as.vector(unlist(Pred_Labels_scmapcluster))
  True_Labels_scmapcell <- as.vector(unlist(True_Labels_scmapcell))
  Pred_Labels_scmapcell <- as.vector(unlist(Pred_Labels_scmapcell))
  Training_Time_scmapcluster <- as.vector(unlist(Training_Time_scmapcluster))
  Testing_Time_scmapcluster <- as.vector(unlist(Testing_Time_scmapcluster))
  Training_Time_scmapcell <- as.vector(unlist(Training_Time_scmapcell))
  Testing_Time_scmapcell <- as.vector(unlist(Testing_Time_scmapcell))
  setwd(OutputDir)
  if (!is.null(GeneOrderPath) && !is.null(NumGenes)){
    write.csv(True_Labels_scmapcluster,paste0('scmapcluster_',NumGenes,'_True_Labels.csv'),row.names = FALSE)
    write.csv(Pred_Labels_scmapcluster,paste0('scmapcluster_',NumGenes,'_Pred_Labels.csv'),row.names = FALSE)
    write.csv(True_Labels_scmapcell,paste0('scmapcell_',NumGenes,'_True_Labels.csv'),row.names = FALSE)
    write.csv(Pred_Labels_scmapcell,paste0('scmapcell_',NumGenes,'_Pred_Labels.csv'),row.names = FALSE)
    write.csv(Training_Time_scmapcluster,paste0('scmapcluster_',NumGenes,'_Training_Time.csv'),row.names = FALSE)
    write.csv(Testing_Time_scmapcluster,paste0('scmapcluster_',NumGenes,'_Testing_Time.csv'),row.names = FALSE)
    write.csv(Training_Time_scmapcell,paste0('scmapcell_',NumGenes,'_Training_Time.csv'),row.names = FALSE)
    write.csv(Testing_Time_scmapcell,paste0('scmapcell_',NumGenes,'_Testing_Time.csv'),row.names = FALSE)
  }
  else{
    write.csv(True_Labels_scmapcluster,'scmapcluster_True_Labels.csv',row.names = FALSE)
    write.csv(Pred_Labels_scmapcluster,'scmapcluster_Pred_Labels.csv',row.names = FALSE)
    write.csv(True_Labels_scmapcell,'scmapcell_True_Labels.csv',row.names = FALSE)
    write.csv(Pred_Labels_scmapcell,'scmapcell_Pred_Labels.csv',row.names = FALSE)
    write.csv(Training_Time_scmapcluster,'scmapcluster_Training_Time.csv',row.names = FALSE)
    write.csv(Testing_Time_scmapcluster,'scmapcluster_Testing_Time.csv',row.names = FALSE)
    write.csv(Training_Time_scmapcell,'scmapcell_Training_Time.csv',row.names = FALSE)
    write.csv(Testing_Time_scmapcell,'scmapcell_Testing_Time.csv',row.names = FALSE)
  }
}
================================================
FILE: Scripts/run_singleCellNet.R
================================================
run_singleCellNet <- function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){
  "
  run singleCellNet
  Wrapper script to run singleCellNet on a benchmark dataset with 5-fold cross validation,
  outputs lists of true and predicted cell labels as csv files, as well as computation time.
  Parameters
  ----------
  DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
  as row names and gene names as column names.
  LabelsPath : Cell population annotations file path (.csv).
  CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
  OutputDir : Output directory defining the path of the exported file.
  GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
  defining the genes order for each cross validation fold, default is NULL.
  NumGenes : Number of genes used in case of feature selection (integer), default is NULL.
  "
  Data <- read.csv(DataPath,row.names = 1)
  # singleCellNet builds gene-pair names with '.', so normalize '_' in gene names
  colnames(Data) <- gsub('_','.',colnames(Data), fixed = TRUE)
  Labels <- as.matrix(read.csv(LabelsPath))
  # Loads n_folds, Train_Idx, Test_Idx, col_Index, Cells_to_Keep
  load(CV_RDataPath)
  Labels <- as.vector(Labels[,col_Index])
  Data <- Data[Cells_to_Keep,]
  Labels <- Labels[Cells_to_Keep]
  # `&&` (scalar, short-circuit) is the correct operator for `if` conditions
  if(!is.null(GeneOrderPath) && !is.null(NumGenes)){
    GenesOrder <- read.csv(GeneOrderPath)
  }
  #############################################################################
  #                             singleCellNet                                 #
  #############################################################################
  library(singleCellNet)
  library(dplyr)
  True_Labels_singleCellNet <- list()
  Pred_Labels_singleCellNet <- list()
  Training_Time_singleCellNet <- list()
  Testing_Time_singleCellNet <- list()
  Data <- t(as.matrix(Data)) # deals also with sparse matrix
  for(i in c(1:n_folds)){
    if(!is.null(GeneOrderPath) && !is.null(NumGenes)){
      # GenesOrder holds 0-based gene indices, hence the +1
      DataTrain <- Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Train_Idx[[i]]]
      DataTest <- Data[as.vector(GenesOrder[c(1:NumGenes),i])+1,Test_Idx[[i]]]
    }
    else{
      DataTrain <- Data[,Train_Idx[[i]]]
      DataTest <- Data[,Test_Idx[[i]]]
    }
    # Training: pick class-discriminating genes, build top gene pairs,
    # transform to the pair space, and fit the random forest classifier
    start_time <- Sys.time()
    cgenes2 <- findClassyGenes(DataTrain, data.frame(Annotation = Labels[Train_Idx[[i]]]), "Annotation")
    cgenesA <- cgenes2[['cgenes']]
    grps <- cgenes2[['grps']]
    DataTrain <- as.matrix(DataTrain[cgenesA,])
    xpairs <- ptGetTop(DataTrain, grps, ncores = 1)
    pdTrain <- query_transform(DataTrain[cgenesA, ], xpairs)
    rf <- sc_makeClassifier(pdTrain[xpairs,], genes=xpairs, groups=grps)
    end_time <- Sys.time()
    Training_Time_singleCellNet[i] <- as.numeric(difftime(end_time,start_time,units = 'secs'))
    # Testing: project the test cells into pair space and classify
    start_time <- Sys.time()
    DataTest <- query_transform(DataTest[cgenesA,], xpairs)
    classRes <- rf_classPredict(rf, DataTest)
    end_time <- Sys.time()
    Testing_Time_singleCellNet[i] <- as.numeric(difftime(end_time,start_time,units = 'secs'))
    True_Labels_singleCellNet[i] <- list(Labels[Test_Idx[[i]]])
    # Per column (cell), the predicted class is the row with the highest score;
    # only the first length(Test_Idx) columns are kept — presumably classRes
    # contains extra (random-profile) columns appended by rf_classPredict.
    Pred_Labels_singleCellNet[i] <- list((rownames(classRes)[apply(classRes,2,which.max)])[1:length(Test_Idx[[i]])])
  }
  # Flatten the per-fold lists into plain vectors before export
  True_Labels_singleCellNet <- as.vector(unlist(True_Labels_singleCellNet))
  Pred_Labels_singleCellNet <- as.vector(unlist(Pred_Labels_singleCellNet))
  Training_Time_singleCellNet <- as.vector(unlist(Training_Time_singleCellNet))
  Testing_Time_singleCellNet <- as.vector(unlist(Testing_Time_singleCellNet))
  setwd(OutputDir)
  if(!is.null(GeneOrderPath) && !is.null(NumGenes)){
    write.csv(True_Labels_singleCellNet,paste0('singleCellNet_',NumGenes,'_True_Labels.csv'),row.names = FALSE)
    write.csv(Pred_Labels_singleCellNet,paste0('singleCellNet_',NumGenes,'_Pred_Labels.csv'),row.names = FALSE)
    write.csv(Training_Time_singleCellNet,paste0('singleCellNet_',NumGenes,'_Training_Time.csv'),row.names = FALSE)
    write.csv(Testing_Time_singleCellNet,paste0('singleCellNet_',NumGenes,'_Testing_Time.csv'),row.names = FALSE)
  }
  else{
    write.csv(True_Labels_singleCellNet,'singleCellNet_True_Labels.csv',row.names = FALSE)
    write.csv(Pred_Labels_singleCellNet,'singleCellNet_Pred_Labels.csv',row.names = FALSE)
    write.csv(Training_Time_singleCellNet,'singleCellNet_Training_Time.csv',row.names = FALSE)
    write.csv(Testing_Time_singleCellNet,'singleCellNet_Testing_Time.csv',row.names = FALSE)
  }
}
================================================
FILE: Snakemake/Cross_Validation.R
================================================
args <- commandArgs(TRUE)

Cross_Validation <- function(LabelsPath, col_Index = 1, OutputDir){
  "
  Cross_Validation
  Function returns train and test indices for 5 folds stratified across unique cell populations,
  also filters out cell populations with 10 or fewer cells.
  It returns a 'CV_folds.RData' file which is then used as input to the classifier wrappers.
  Parameters
  ----------
  LabelsPath : Cell population annotations file path (.csv).
  col_Index : column index (integer) defining which level of annotation to use,
  in case of multiple cell type annotations (default is 1)
  OutputDir : Output directory defining the path of the exported file.
  "
  Labels <- as.matrix(read.csv(LabelsPath))
  Labels <- as.vector(Labels[,col_Index])
  # A class is removed unless it has strictly more than 10 cells
  # (i.e. populations with <= 10 cells are filtered out).
  Removed_classes <- !(table(Labels) > 10)
  Cells_to_Keep <- !(is.element(Labels,names(Removed_classes)[Removed_classes]))
  Labels <- Labels[Cells_to_Keep]
  # Getting training and testing Folds
  library(rBayesianOptimization)
  n_folds <- 5
  # Stratified K-fold split preserving class proportions per fold
  Folds <- KFold(Labels,nfolds = n_folds, stratified = TRUE)
  Test_Folds <- c(n_folds:1)
  Train_Idx <- list()
  Test_Idx <- list()
  for (i in c(1:length(Folds))){
    # Fold Test_Folds[i] is held out; the remaining folds form the training set
    Temp_Folds <- Folds
    Temp_Folds[Test_Folds[i]] <- NULL
    Train_Idx[i] <- list(unlist(Temp_Folds))
    Test_Idx[i] <- Folds[Test_Folds[i]]
  }
  remove(Temp_Folds,i,Folds)
  save(n_folds,Train_Idx,Test_Idx,col_Index,Cells_to_Keep,file = paste0(OutputDir, '/CV_folds.RData'))
}

Cross_Validation(args[1], as.numeric(args[2]), args[3])
================================================
FILE: Snakemake/DEgenesMAST.R
================================================
DEgenesMAST <- function(Data, Labels, Normalize = FALSE, LogTransform = FALSE){
  # This function applies a differential expression test to the data using one vs all.
  # The training data should be used as input.
  # The output is a matrix with marker genes where the columns are the cell populations
  # and the rows are the top 20 marker genes (NA-padded when fewer are found).
  # This output can be rewritten to the format of the prior-knowledge-supervised
  # classifiers and afterwards be used to classify the test set.
  # Data: genes X cells (rows = genes, columns = cells)
  # Labels: labels of the data
  # Normalize: the input for MAST should be cpm normalized data,
  #            if the data is not normalized yet, this should be set to TRUE
  # LogTransform: the input for MAST should be logtransformed,
  #               if the data is not logtransformed yet, this should be set to TRUE
  library(Seurat)
  if(Normalize)
  {
    # Counts-per-million normalization, per cell (column)
    Data <- apply(Data, 2, function(x) (x/sum(x))*1000000)
  }
  if(LogTransform)
  {
    Data <- log(Data+1, base = 2)
  }
  SeuObj <- CreateSeuratObject(raw.data = Data, project = "DEgenes")
  SeuObj <- SetIdent(SeuObj, ident.use = Labels)
  DEgenes <- FindAllMarkers(SeuObj, test.use = "MAST")
  Markers <- matrix(nrow = 20,ncol = length(unique(Labels)))
  colnames(Markers) <- unique(Labels)
  for (i in unique(Labels)){
    # Keep only up-regulated genes for this population
    # (a stray no-op `i` expression that was here has been removed)
    TempList <- DEgenes$gene[((DEgenes$cluster == i) & (DEgenes$avg_logFC > 0))]
    MarkerGenes <- DEgenes$p_val_adj[DEgenes$cluster == i]
    # Debug output: adjusted p-values of the first 20 hits for this population
    print(MarkerGenes[1:20])
    if (length(TempList) >= 20){
      Markers[,i] <- TempList[1:20]
    }
    else if (length(TempList) > 0){
      # Fewer than 20 markers: fill the top rows, leave the rest NA
      Markers[c(1:length(TempList)),i] <- TempList
    }
  }
  return(Markers)
}
================================================
FILE: Snakemake/Dockerfiles/baseline/Dockerfile
================================================
# Image for the baseline classifiers (kNN, LDA, NMC, RF, SVM) plus dropout-based
# feature ranking; needs both R (for the CV .RData files via rpy2) and Python.
FROM debian:9.9-slim
# Install newest R version
RUN apt-get update && \
apt-get install --no-install-recommends --yes wget gnupg apt-transport-https && \
wget -qO - http://keys.gnupg.net/pks/lookup?op=get\&search=0xAD5F960A256A04AF | apt-key add - && \
echo 'deb http://cloud.r-project.org/bin/linux/debian stretch-cran35/' >> /etc/apt/sources.list && \
apt-get update && \
apt-get install --no-install-recommends --yes r-base && \
apt-get purge --yes wget gnupg apt-transport-https && \
apt-get autoremove --yes && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
# Install python
RUN apt-get update && \
apt-get install --no-install-recommends --yes python3 python3-pip && \
pip3 --no-cache-dir install setuptools && \
pip3 --no-cache-dir install pandas rpy2 scikit-learn statsmodels && \
rm -rf /var/lib/apt/lists/*
# Bundle the wrapper scripts into /Scripts
COPY Scripts/run_kNN50.py \
Scripts/run_kNN9.py \
Scripts/run_LDA.py \
Scripts/run_LDA_rejection.py \
Scripts/run_NMC.py \
Scripts/run_RF.py \
Scripts/run_SVM.py \
Scripts/run_SVM_rejection.py \
rank_gene_dropouts.py \
/Scripts/
================================================
FILE: Snakemake/Dockerfiles/cell_blast/Dockerfile
================================================
# Image for Cell BLAST; Python base with R added for reading the CV .RData files.
FROM python:3.7-slim-stretch
# Install newest R version
RUN apt-get update && \
apt-get install --no-install-recommends --yes wget gnupg apt-transport-https && \
wget -qO - http://keys.gnupg.net/pks/lookup?op=get\&search=0xAD5F960A256A04AF | apt-key add - && \
echo 'deb http://cloud.r-project.org/bin/linux/debian stretch-cran35/' >> /etc/apt/sources.list && \
apt-get update && \
apt-get install --no-install-recommends --yes r-base && \
apt-get purge --yes wget gnupg apt-transport-https && \
apt-get autoremove --yes && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
# Install python and pip deps
RUN apt-get update && apt-get install --no-install-recommends --yes build-essential python3-dev libxml2 libxml2-dev zlib1g-dev && \
pip3 --no-cache-dir install --upgrade pip && \
pip3 --no-cache-dir install --upgrade setuptools && \
pip3 --no-cache-dir install pandas rpy2 scikit-learn statsmodels tensorflow Cell-BLAST && \
apt-get autoremove --yes && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
COPY Scripts/run_Cell_BLAST.py /Scripts/
================================================
FILE: Snakemake/Dockerfiles/chetah/Dockerfile
================================================
# Image for the CHETAH classifier (R only).
FROM debian:9.9-slim
# Install newest R version
RUN apt-get update && \
apt-get install --no-install-recommends --yes wget gnupg apt-transport-https && \
wget -qO - http://keys.gnupg.net/pks/lookup?op=get\&search=0xAD5F960A256A04AF | apt-key add - && \
echo 'deb http://cloud.r-project.org/bin/linux/debian stretch-cran35/' >> /etc/apt/sources.list && \
apt-get update && \
apt-get install --no-install-recommends --yes r-base && \
apt-get purge --yes wget gnupg apt-transport-https && \
apt-get autoremove --yes && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
COPY Scripts/run_CHETAH.R \
Dockerfiles/chetah/install_packages.R \
/Scripts/
# Install R packages; build toolchain is installed, used, then purged in the
# same layer to keep the image small.
RUN apt-get update && \
apt-get install --no-install-recommends --yes make gcc g++ r-base-dev libcurl4-openssl-dev libssl-dev libxml2-dev && \
Rscript --vanilla /Scripts/install_packages.R && \
apt-get purge --yes make gcc g++ r-base-dev libcurl4-openssl-dev libssl-dev libxml2-dev && \
apt-get autoremove --yes && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
================================================
FILE: Snakemake/Dockerfiles/chetah/install_packages.R
================================================
# Install CHETAH and its plotting/clustering dependencies.
# Any warning raised during installation is promoted to an error so the
# Docker build fails loudly instead of producing a silently broken image.
withCallingHandlers({
install.packages("devtools", repos="https://cloud.r-project.org/")
install.packages("BiocManager", repos="https://cloud.r-project.org/")
BiocManager::install(c("bioDist", "ggplot2", "gplots", "cowplot",
"dendextend", "corrplot", "reshape2", "plotly"))
# CHETAH is pinned to a specific commit for reproducibility
devtools::install_github("jdekanter/CHETAH", ref="b777e6f671bff3c434842adb655869a52bc9e368")
},
warning = function(w) stop(w))
================================================
FILE: Snakemake/Dockerfiles/cross_validation/Dockerfile
================================================
# Image that generates the CV_folds.RData cross-validation splits (R only).
FROM debian:9.9-slim
# Install newest R version
RUN apt-get update && \
apt-get install --no-install-recommends --yes wget gnupg apt-transport-https && \
wget -qO - http://keys.gnupg.net/pks/lookup?op=get\&search=0xAD5F960A256A04AF | apt-key add - && \
echo 'deb http://cloud.r-project.org/bin/linux/debian stretch-cran35/' >> /etc/apt/sources.list && \
apt-get update && \
apt-get install --no-install-recommends --yes r-base && \
apt-get purge --yes wget gnupg apt-transport-https && \
apt-get autoremove --yes && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
COPY Cross_Validation.R \
Dockerfiles/cross_validation/install_packages.R \
/Scripts/
# Install R packages, then purge the build toolchain.
# NOTE(review): libc6-dev is installed but absent from the purge list, so it
# stays in the final image — confirm whether that is intentional.
RUN apt-get update && \
apt-get install --no-install-recommends --yes make gcc libc6-dev g++ libxml2-dev && \
Rscript --vanilla /Scripts/install_packages.R && \
apt-get purge --yes make gcc g++ libxml2-dev && \
apt-get autoremove --yes && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
================================================
FILE: Snakemake/Dockerfiles/cross_validation/install_packages.R
================================================
# Install rBayesianOptimization (provides KFold used by Cross_Validation.R).
# Warnings are promoted to errors so a failed install breaks the Docker build.
withCallingHandlers({
install.packages("lhs", repos="https://cloud.r-project.org/")
install.packages("rBayesianOptimization", repos="https://cloud.r-project.org/")
},
warning = function(w) stop(w))
================================================
FILE: Snakemake/Dockerfiles/garnett/Dockerfile
================================================
# Image for the Garnett classifier (CV and pretrained variants, R only).
FROM debian:9.9-slim
# Install newest R version
RUN apt-get update && \
apt-get install --no-install-recommends --yes wget gnupg apt-transport-https && \
wget -qO - http://keys.gnupg.net/pks/lookup?op=get\&search=0xAD5F960A256A04AF | apt-key add - && \
echo 'deb http://cloud.r-project.org/bin/linux/debian stretch-cran35/' >> /etc/apt/sources.list && \
apt-get update && \
apt-get install --no-install-recommends --yes r-base && \
apt-get purge --yes wget gnupg apt-transport-https && \
apt-get autoremove --yes && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
COPY Scripts/run_Garnett_CV.R \
Scripts/run_Garnett_Pretrained.R \
Dockerfiles/garnett/install_packages.R \
/Scripts/
# Install R packages, then purge the build toolchain.
# NOTE(review): libxml2-dev is installed but not purged afterwards — confirm
# whether it is needed at runtime or simply missing from the purge list.
RUN apt-get update && \
apt-get install --no-install-recommends --yes make gcc g++ libxml2-dev zlib1g-dev gfortran liblapack-dev libcurl4-gnutls-dev libssl-dev && \
Rscript --vanilla /Scripts/install_packages.R && \
apt-get purge --yes make gcc g++ zlib1g-dev gfortran liblapack-dev libcurl4-gnutls-dev libssl-dev && \
apt-get autoremove --yes && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
================================================
FILE: Snakemake/Dockerfiles/garnett/install_packages.R
================================================
# Install Garnett with its monocle/annotation dependencies.
# Warnings are promoted to errors so a failed install breaks the Docker build.
withCallingHandlers({
install.packages("BiocManager", repos="https://cloud.r-project.org/")
BiocManager::install(c("monocle", "DelayedArray", "DelayedMatrixStats",
"org.Hs.eg.db", "org.Mm.eg.db"))
install.packages("devtools", repos="https://cloud.r-project.org/")
# Garnett is pinned to a specific commit for reproducibility
devtools::install_github("cole-trapnell-lab/garnett", ref="9804b532bbcc1714b3ed0b718cf430741f1dba6c")
},
warning = function(w) stop(w))
================================================
FILE: Snakemake/Dockerfiles/scid/Dockerfile
================================================
# Image for the scID classifier, built on the official r-base image.
FROM r-base:3.6.0
COPY Scripts/run_scID.R \
Dockerfiles/scid/install_packages.R \
/Scripts/
# Install R packages; build toolchain is installed, used, then purged in the
# same layer to keep the image small.
RUN apt-get update && \
apt-get install --no-install-recommends --yes make gcc g++ r-base-dev libcurl4-openssl-dev libssl-dev libxml2-dev && \
Rscript --vanilla /Scripts/install_packages.R && \
apt-get purge --yes make gcc g++ r-base-dev libcurl4-openssl-dev libssl-dev libxml2-dev && \
apt-get autoremove --yes && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
================================================
FILE: Snakemake/Dockerfiles/scid/install_packages.R
================================================
# Install scID with Seurat and its Bioconductor dependencies.
# Warnings are promoted to errors so a failed install breaks the Docker build.
# NOTE(review): unlike the other install scripts, the install_github() calls
# below are not pinned to a commit (no ref=), so builds are not reproducible —
# confirm and pin if reproducibility is required.
withCallingHandlers({
install.packages("BiocManager", repos="https://cloud.r-project.org/")
BiocManager::install(ask = FALSE);
BiocManager::install(c("scater", "MAST"))
install.packages("devtools", repos="https://cloud.r-project.org/")
devtools::install_github("satijalab/seurat")
devtools::install_github("BatadaLab/scID")
},
warning = function(w) stop(w))
================================================
FILE: Snakemake/Dockerfiles/scmap/Dockerfile
================================================
# Image for the scmap classifiers (scmap-cell and scmap-cluster).
# NOTE(review): this COPY expects Scripts/run_scmapcell.R and
# Scripts/run_scmapcluster.R — verify those files exist in the build context
# (the non-Snakemake tree ships a combined run_scmap.R instead).
FROM r-base:3.6.0
COPY Scripts/run_scmapcell.R \
Scripts/run_scmapcluster.R \
Dockerfiles/scmap/install_packages.R \
/Scripts/
# Install R packages; build toolchain is installed, used, then purged in the
# same layer to keep the image small.
RUN apt-get update && \
apt-get install --no-install-recommends --yes make gcc g++ r-base-dev libcurl4-openssl-dev libssl-dev libxml2-dev && \
Rscript --vanilla /Scripts/install_packages.R && \
apt-get purge --yes make gcc g++ r-base-dev libcurl4-openssl-dev libssl-dev libxml2-dev && \
apt-get autoremove --yes && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
================================================
FILE: Snakemake/Dockerfiles/scmap/install_packages.R
================================================
# Install scmap and SingleCellExperiment.
# Warnings are promoted to errors so a failed install breaks the Docker build.
# NOTE(review): the scmap install_github() call is not pinned to a commit
# (no ref=), unlike other install scripts in this repo — confirm and pin
# if reproducible builds are required.
withCallingHandlers({
install.packages("BiocManager", repos="https://cloud.r-project.org/")
BiocManager::install(ask = FALSE)
BiocManager::install("SingleCellExperiment")
install.packages("devtools", repos="https://cloud.r-project.org/")
devtools::install_github("hemberg-lab/scmap")
},
warning = function(w) stop(w))
================================================
FILE: Snakemake/Dockerfiles/scvi/Dockerfile
================================================
# Image for scVI/SCANVI; Python base with R added for reading CV .RData files.
FROM python:3.7-slim-stretch
# Install newest R version
RUN apt-get update && \
apt-get install --no-install-recommends --yes wget gnupg apt-transport-https && \
wget -qO - http://keys.gnupg.net/pks/lookup?op=get\&search=0xAD5F960A256A04AF | apt-key add - && \
echo 'deb http://cloud.r-project.org/bin/linux/debian stretch-cran35/' >> /etc/apt/sources.list && \
apt-get update && \
apt-get install --no-install-recommends --yes r-base && \
apt-get purge --yes wget gnupg apt-transport-https && \
apt-get autoremove --yes && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
# Install python and pip deps (scvi is unpinned — the run_scVI.py script
# targets the 0.x scvi API)
RUN apt-get update && apt-get install --no-install-recommends --yes build-essential python3-dev libxml2 libxml2-dev zlib1g-dev && \
pip3 --no-cache-dir install --upgrade pip && \
pip3 --no-cache-dir install --upgrade setuptools && \
pip3 --no-cache-dir install pandas rpy2 scikit-learn statsmodels tensorflow scvi && \
apt-get autoremove --yes && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
COPY Scripts/run_scVI.py /Scripts/
================================================
FILE: Snakemake/Dockerfiles/singlecellnet/Dockerfile
================================================
# Image for the singleCellNet classifier (R only).
FROM debian:9.9-slim
# Install newest R version
RUN apt-get update && \
apt-get install --no-install-recommends --yes wget gnupg apt-transport-https && \
wget -qO - http://keys.gnupg.net/pks/lookup?op=get\&search=0xAD5F960A256A04AF | apt-key add - && \
echo 'deb http://cloud.r-project.org/bin/linux/debian stretch-cran35/' >> /etc/apt/sources.list && \
apt-get update && \
apt-get install --no-install-recommends --yes r-base && \
apt-get purge --yes wget gnupg apt-transport-https && \
apt-get autoremove --yes && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
COPY Scripts/run_singleCellNet.R \
Dockerfiles/singlecellnet/install_packages.R \
/Scripts/
# Install R packages; build toolchain is installed, used, then purged in the
# same layer to keep the image small.
RUN apt-get update && \
apt-get install --no-install-recommends --yes make gcc libc6-dev g++ libcurl4-openssl-dev zlib1g-dev libssl-dev r-base-dev libxml2-dev && \
Rscript --vanilla /Scripts/install_packages.R && \
apt-get purge --yes make gcc g++ zlib1g-dev libcurl4-openssl-dev libc6-dev libssl-dev r-base-dev libxml2-dev && \
apt-get autoremove --yes && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
================================================
FILE: Snakemake/Dockerfiles/singlecellnet/install_packages.R
================================================
# Install singleCellNet with its fgsea/patchwork dependencies.
# Warnings are promoted to errors so a failed install breaks the Docker build.
withCallingHandlers({
install.packages("devtools", repos="https://cloud.r-project.org/")
install.packages("BiocManager", repos="https://cloud.r-project.org/")
BiocManager::install("fgsea")
# Both GitHub packages are pinned to specific commits for reproducibility
devtools::install_github("thomasp85/patchwork", ref="fd7958bae3e7a1e30237c751952e412a0a1d1242")
devtools::install_github("pcahan1/singleCellNet", ref="4279a68112743b783cc82628421dd703261ec117")
},
warning = function(w) stop(w))
================================================
FILE: Snakemake/Dockerfiles/singler/Dockerfile
================================================
# Image for the SingleR classifier (R only).
FROM debian:9.9-slim
# Install newest R version
RUN apt-get update && \
apt-get install --no-install-recommends --yes wget gnupg apt-transport-https && \
wget -qO - http://keys.gnupg.net/pks/lookup?op=get\&search=0xAD5F960A256A04AF | apt-key add - && \
echo 'deb http://cloud.r-project.org/bin/linux/debian stretch-cran35/' >> /etc/apt/sources.list && \
apt-get update && \
apt-get install --no-install-recommends --yes r-base && \
apt-get purge --yes wget gnupg apt-transport-https && \
apt-get autoremove --yes && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
COPY Scripts/run_SingleR.R \
Dockerfiles/singler/install_packages.R \
/Scripts/
# Install R packages; the build toolchain and -dev headers are purged after
# installation, while the libxml2 runtime library is kept in the image.
RUN apt-get update && \
apt-get install --no-install-recommends --yes make gcc g++ r-base-dev libcurl4-openssl-dev libssl-dev libxml2-dev libxml2 && \
Rscript --vanilla /Scripts/install_packages.R && \
apt-get purge --yes make gcc g++ r-base-dev libcurl4-openssl-dev libssl-dev libxml2-dev && \
apt-get autoremove --yes && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
================================================
FILE: Snakemake/Dockerfiles/singler/install_packages.R
================================================
# Install Seurat from CRAN and SingleR at a pinned git revision.
# Warnings raised during installation are escalated to errors so the Docker
# build fails loudly on a broken or partial install.
fail_on_warning <- function(w) stop(w)

withCallingHandlers(
  {
    repo <- "https://cloud.r-project.org/"
    install.packages("devtools", repos = repo)
    install.packages("Seurat", repos = repo)
    devtools::install_github("dviraran/SingleR",
                             ref = "db4823b380ba2c3142c857c8c0695200dd1736f6")
  },
  warning = fail_on_warning
)
================================================
FILE: Snakemake/LICENSE
================================================
MIT License
Copyright (c) 2019 tabdelaal
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
================================================
FILE: Snakemake/README.md
================================================
# scRNAseq_Benchmark
Benchmarking classification tools for scRNA-seq data
## How to use
[snakemake](https://snakemake.readthedocs.io/en/stable/index.html) and
[singularity](https://www.sylabs.io/docs/) need to be available on your
system. You will need to run this on a linux system, as singularity
only supports linux.
From the root of this repository:
```
snakemake \
--configfile <configfile> \
--use-singularity
```
If your data or output directory is not located under the root of this
repository, be sure to tell snakemake to mount the appropriate directories
in singularity:
```
snakemake \
--configfile <configfile> \
--use-singularity \
--singularity-args '--bind <location of inputs>:<location of inputs> --bind <output directory>:<output directory>'
```
#### The config file
```YML
output_dir: <path to outputs directory>
datafile: <path to csv file with counts per cell>
labfile: <csv with true labels per cell>
column: <The index of the column in the labels file which ought to be used, defaults to 1>
number_of_features: <number of features to be used as input for the classification methods, 0 means all, defaults to 0>
genes: <path to gene name list, only needed for garnett_CV and Garnett_Pretrained>
human: <whether or not the data is human, true means human, false means mouse, defaults to true>
tools_to_run: # List of tools to run
- <tool 1>
- <tool 2>
- <...>
```
##### Tool specific inputs
Some tools require specific inputs. Add the following to your config file when
using one of these tools:
- Garnett_CV
```YML
Garnett_CV:
markers: <path to Garnett marker gene file>
```
- Garnett_Pretrained
```YML
Garnett_Pretrained:
classifier: <path to Garnett classifier>
```
<!-- TODO explain these input files -->
## Included tools/methods
- kNN50
- kNN9
- LDA
- LDA_rejection (LDA with rejection option)
- NMC
- RF
- SVM
- SVM_rejection (SVM with rejection option)
- [singleCellNet](https://github.com/pcahan1/singleCellNet)
- [CHETAH](https://github.com/jdekanter/CHETAH)
- [scmap](https://github.com/hemberg-lab/scmap)
- scmapcell
- scmapcluster
- [SingleR](https://github.com/dviraran/SingleR)
- [scID](https://github.com/BatadaLab/scID)
- [scVI](https://github.com/YosefLab/scVI)
- [Cell_BLAST](https://github.com/gao-lab/Cell_BLAST)
- [Garnett](https://cole-trapnell-lab.github.io/garnett/)
- Garnett_CV (without pretrained classifier)
- Garnett_Pretrained (with pretrained classifier)
## Adding new tools
In order to add a tool to this benchmarking workflow, a rule for this tool
needs to be added to the `Snakefile`. This rule should produce as output:
- a table of predicted labels (`<output directory>/<tool>/<tool>_pred.csv`).
- a table of true labels (`<output directory>/<tool>/<tool>_true.csv`).
- tables of testing, training and/or total time:
- `<output directory>/<tool>/<tool>_test_time.csv`
- `<output directory>/<tool>/<tool>_training_time.csv`
- `<output directory>/<tool>/<tool>_total_time.csv`
The input to this rule should be:
- a count table (specified as the `datafile` in the config).
- a true labels file (specified as the `labfile` in the config).
You will want to write a wrapper script for the tool you want to
add to facilitate this. The `"{output_dir}/CV_folds.RData"` input may be
used to provide your wrapper script with folds for cross_validation.
It is recommended to make a docker image containing all dependencies for both
the tool and any wrappers for the tool.
This wrapper script should also make a selection of the features to be used.
This selection should be based on a ranking, which can be accessed by providing
`feature_ranking` as input to the wrapper script. The number of features to be
used should be configurable and settable through the 'number_of_features' field
in the config.
The following can be used as a template for new rules. Replace everything
surrounded by (and including the) `<>` with appropriate values.
```
rule <tool name>:
input:
datafile = config["datafile"],
labfile = config["labfile"],
folds = "{output_dir}/CV_folds.RData",
ranking = feature_ranking
output:
pred = "{output_dir}/<tool name>/<tool name>_pred.csv",
true = "{output_dir}/<tool name>/<tool name>_true.csv",
test_time = "{output_dir}/<tool name>/<tool name>_test_time.csv",
training_time = "{output_dir}/<tool name>/<tool name>_training_time.csv"
log: "{output_dir}/<tool name>/<tool name>.log"
params:
n_features = config.get("number_of_features", 0)
singularity: "docker://<docker image>"
shell:
"<python or Rscript> <wrapper script> "
"{input.datafile} "
"{input.labfile} "
"{input.folds} "
"{wildcards.output_dir}/<tool name> "
"{input.ranking} "
"{params.n_features} "
"&> {log}"
```
================================================
FILE: Snakemake/Scripts/run_ACTINN.py
================================================
import os
import numpy as np
import pandas as pd
import time as tm
import rpy2.robjects as robjects
def run_ACTINN(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
    '''
    run ACTINN
    Wrapper script to run ACTINN on a benchmark dataset with 5-fold cross validation,
    outputs lists of true and predicted cell labels as csv files, as well as computation time.
    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
    as row names and gene names as column names.
    LabelsPath : Cell population annotations file path (.csv).
    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
    OutputDir : Output directory defining the path of the exported file.
    GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
    defining the genes order for each cross validation fold, default is NULL.
    NumGenes : Number of genes used in case of feature selection (integer), default is 0.
    '''
    # read the Rdata file (provides n_folds, Cells_to_Keep, col_Index,
    # Test_Idx, Train_Idx as R objects accessed through rpy2)
    robjects.r['load'](CV_RDataPath)
    nfolds = np.array(robjects.r['n_folds'], dtype = 'int')
    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
    col = np.array(robjects.r['col_Index'], dtype = 'int')
    # R indices are 1-based; shift to 0-based for pandas usecols
    col = col - 1
    test_ind = np.array(robjects.r['Test_Idx'])
    train_ind = np.array(robjects.r['Train_Idx'])
    # read the data (cells x genes) and the label column, then drop filtered cells
    data = pd.read_csv(DataPath,index_col=0,sep=',')
    labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col)
    labels = labels.iloc[tokeep]
    data = data.iloc[tokeep]
    # read the feature file (per-fold gene ranking) only when feature selection is requested
    if (NumGenes > 0):
        features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')
    # folder with results; all intermediate and output files are written here
    os.chdir(OutputDir)
    tot=[]
    truelab = []
    pred = []
    for i in range(np.squeeze(nfolds)):
        # per-fold indices are 1-based (from R) — shift to 0-based
        test_ind_i = np.array(test_ind[i], dtype = 'int') - 1
        train_ind_i = np.array(train_ind[i], dtype = 'int') - 1
        train=data.iloc[train_ind_i]
        test=data.iloc[test_ind_i]
        y_train=labels.iloc[train_ind_i]
        y_test=labels.iloc[test_ind_i]
        if (NumGenes > 0):
            # restrict both splits to the top-ranked genes of this fold
            feat_to_use = features.iloc[0:NumGenes,i]
            train = train.iloc[:,feat_to_use]
            test = test.iloc[:,feat_to_use]
        # ACTINN expects genes as rows, cells as columns
        train = train.transpose()
        test = test.transpose()
        # hand the fold over to ACTINN via intermediate files
        train.to_csv("train.csv")
        test.to_csv("test.csv")
        y_train.to_csv("train_lab.csv", header = False, index = True, sep = '\t')
        y_test.to_csv("test_lab.csv", header = False, index = True, sep = '\t')
        # NOTE(review): the sleeps presumably let a shared filesystem settle
        # before/after the external calls — confirm they are still needed.
        tm.sleep(60)
        # NOTE(review): hard-coded user-specific ACTINN paths; these need to be
        # adapted (or made configurable) for any other environment.
        os.system("python /home/nfs/lcmmichielsen/classifiers/ACTINN/actinn_format.py -i train.csv -o train -f csv")
        os.system("python /home/nfs/lcmmichielsen/classifiers/ACTINN/actinn_format.py -i test.csv -o test -f csv")
        start = tm.time()
        os.system("python /home/nfs/lcmmichielsen/classifiers/ACTINN/actinn_predict.py -trs train.h5 -trl train_lab.csv -ts test.h5")
        tot.append(tm.time()-start)
        tm.sleep(60)
        truelab.extend(y_test.values)
        # ACTINN writes its predictions to predicted_label.txt (second column)
        predlabels = pd.read_csv('predicted_label.txt',header=0,index_col=None, sep='\t', usecols = [1])
        pred.extend(predlabels.values)
    truelab = pd.DataFrame(truelab)
    pred = pd.DataFrame(pred)
    tot_time = pd.DataFrame(tot)
    # output file names encode the number of selected genes (if any)
    if (NumGenes == 0):
        truelab.to_csv("ACTINN_True_Labels.csv", index = False)
        pred.to_csv("ACTINN_Pred_Labels.csv", index = False)
        tot_time.to_csv("ACTINN_Total_Time.csv", index = False)
    else:
        truelab.to_csv("ACTINN_" + str(NumGenes) + "_True_Labels.csv", index = False)
        pred.to_csv("ACTINN_" + str(NumGenes) + "_Pred_Labels.csv", index = False)
        tot_time.to_csv("ACTINN_" + str(NumGenes) + "_Total_Time.csv", index = False)
================================================
FILE: Snakemake/Scripts/run_CHETAH.R
================================================
args <- commandArgs(TRUE)
run_CHETAH <- function(DataPath, LabelsPath, CV_RDataPath, OutputDir,
                       GeneOrderPath = NULL, NumGenes = NULL) {
  "
  run CHETAH
  Wrapper script to run CHETAH on a benchmark dataset with 5-fold cross validation,
  outputs lists of true and predicted cell labels as csv files, as well as computation time.
  Parameters
  ----------
  DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
  as row names and gene names as column names.
  LabelsPath : Cell population annotations file path (.csv).
  CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
  OutputDir : Output directory defining the path of the exported file.
  GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
  defining the genes order for each cross validation fold, default is NULL.
  NumGenes : Number of genes used in case of feature selection (integer), default is NULL.
  "
  Data <- read.csv(DataPath, row.names = 1)
  Labels <- as.matrix(read.csv(LabelsPath))
  # Provides n_folds, Cells_to_Keep, col_Index, Train_Idx, Test_Idx.
  load(CV_RDataPath)
  Labels <- as.vector(Labels[, col_Index])
  Data <- Data[Cells_to_Keep, ]
  Labels <- Labels[Cells_to_Keep]
  # Fix: use the scalar `&&` (not the vectorized `&`) for this scalar
  # condition; evaluate it once instead of repeating it inside the loop.
  use_ranking <- !is.null(GeneOrderPath) && !is.null(NumGenes)
  if (use_ranking) {
    GenesOrder <- read.csv(GeneOrderPath)
  }
  #############################################################################
  #                                 CHETAH                                    #
  #############################################################################
  library(CHETAH)
  library(SingleCellExperiment)
  True_Labels_CHETAH <- list()
  Pred_Labels_CHETAH <- list()
  Total_Time_CHETAH <- list()
  # CHETAH expects genes as rows, cells as columns.
  Data <- t(as.matrix(Data))
  for (i in seq_len(n_folds)) {
    # Select the gene rows for this fold: the per-fold ranking (0-based in the
    # csv, hence the +1) when feature selection is on, otherwise all genes.
    if (use_ranking) {
      genes_i <- as.vector(GenesOrder[seq_len(NumGenes), i]) + 1
    } else {
      genes_i <- seq_len(nrow(Data))
    }
    sce <- SingleCellExperiment(
      assays = list(counts = Data[genes_i, Train_Idx[[i]]]),
      colData = data.frame(celltypes = Labels[Train_Idx[[i]]]))
    sce_test <- SingleCellExperiment(
      assays = list(counts = Data[genes_i, Test_Idx[[i]]]),
      colData = data.frame(celltypes = Labels[Test_Idx[[i]]]))
    # Only the classification itself is timed, not the SCE construction.
    start_time <- Sys.time()
    if (use_ranking) {
      sce_test <- CHETAHclassifier(input = sce_test, ref_cells = sce,
                                   n_genes = NumGenes)
    } else {
      sce_test <- CHETAHclassifier(input = sce_test, ref_cells = sce)
    }
    end_time <- Sys.time()
    Total_Time_CHETAH[i] <- as.numeric(difftime(end_time, start_time, units = 'secs'))
    True_Labels_CHETAH[i] <- list(Labels[Test_Idx[[i]]])
    Pred_Labels_CHETAH[i] <- list(sce_test$celltype_CHETAH)
  }
  True_Labels_CHETAH <- as.vector(unlist(True_Labels_CHETAH))
  Pred_Labels_CHETAH <- as.vector(unlist(Pred_Labels_CHETAH))
  Total_Time_CHETAH <- as.vector(unlist(Total_Time_CHETAH))
  write.csv(True_Labels_CHETAH, paste0(OutputDir, '/CHETAH_true.csv'), row.names = FALSE)
  write.csv(Pred_Labels_CHETAH, paste0(OutputDir, '/CHETAH_pred.csv'), row.names = FALSE)
  write.csv(Total_Time_CHETAH, paste0(OutputDir, '/CHETAH_total_time.csv'), row.names = FALSE)
}
# Forward the command-line arguments to run_CHETAH.  A NumGenes argument of
# "0" disables feature selection, in which case the gene-ranking file and
# gene count are simply not passed along.
cli_args <- as.list(args[1:4])
if (args[6] != "0") {
  cli_args <- c(cli_args, list(args[5], as.numeric(args[6])))
}
do.call(run_CHETAH, cli_args)
================================================
FILE: Snakemake/Scripts/run_CaSTLe.R
================================================
run_CaSTLe<-function(DataPath,LabelsPath,CV_RDataPath, OutputDir, GeneOrderPath = NULL, NumGenes = NULL){
  "
  run CaSTLe
  Wrapper script to run CaSTLe on a benchmark dataset with 5-fold cross validation,
  outputs lists of true and predicted cell labels as csv files, as well as computation time.
  Parameters
  ----------
  DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
  as row names and gene names as column names.
  LabelsPath : Cell population annotations file path (.csv).
  CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
  OutputDir : Output directory defining the path of the exported file.
  GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
  defining the genes order for each cross validation fold, default is NULL.
  NumGenes : Number of genes used in case of feature selection (integer), default is NULL.
  "
  # NOTE(review): unlike the sibling wrapper scripts, this file only defines
  # the function and never invokes it via commandArgs — confirm intended.
  Data <- read.csv(DataPath,row.names = 1)
  Labels <- as.matrix(read.csv(LabelsPath))
  # Provides n_folds, Cells_to_Keep, col_Index, Train_Idx, Test_Idx.
  load(CV_RDataPath)
  Labels <- as.vector(Labels[,col_Index])
  Data <- Data[Cells_to_Keep,]
  Labels <- Labels[Cells_to_Keep]
  # NOTE(review): `&` happens to work on these scalars, but `&&` is the
  # conventional operator for scalar conditions.
  if(!is.null(GeneOrderPath) & !is.null (NumGenes)){
    GenesOrder = read.csv(GeneOrderPath)
  }
  #############################################################################
  #                                 CaSTLe                                    #
  #############################################################################
  library(igraph)
  library(xgboost)
  True_Labels_Castle <- list()
  Pred_Labels_Castle <- list()
  Training_Time_Castle <- list()
  Testing_Time_Castle <- list()
  # Bin edges used to discretize expression values for the classifier.
  BREAKS=c(-1, 0, 1, 6, Inf)
  # Number of top-ranked features taken from each of the two rankings below.
  nFeatures = 100
  for(i in c(1:n_folds)){
    # 1. Load datasets (per-fold train/test split; the ranking csv is
    # 0-based, hence the +1 when feature selection is active)
    if(!is.null(GeneOrderPath) & !is.null (NumGenes)){
      ds1 = Data[Train_Idx[[i]],as.vector(GenesOrder[c(1:NumGenes),i])+1]
      ds2 = Data[Test_Idx[[i]],as.vector(GenesOrder[c(1:NumGenes),i])+1]
    }
    else{
      ds1 = Data[Train_Idx[[i]],]
      ds2 = Data[Test_Idx[[i]],]
    }
    sourceCellTypes = as.factor(Labels[Train_Idx[[i]]])
    targetCellTypes = as.factor(Labels[Test_Idx[[i]]])
    start_time <- Sys.time()
    # 2. Unify sets, excluding low expressed genes
    # (keep genes expressed in more than 10 cells in both splits)
    source_n_cells_counts = apply(ds1, 2, function(x) { sum(x > 0) } )
    target_n_cells_counts = apply(ds2, 2, function(x) { sum(x > 0) } )
    common_genes = intersect( colnames(ds1)[source_n_cells_counts>10],
    colnames(ds2)[target_n_cells_counts>10])
    remove(source_n_cells_counts, target_n_cells_counts)
    ds1 = ds1[, colnames(ds1) %in% common_genes]
    ds2 = ds2[, colnames(ds2) %in% common_genes]
    ds = rbind(ds1[,common_genes], ds2[,common_genes])
    # Boolean mask marking which rows of `ds` came from the training split.
    isSource = c(rep(TRUE,nrow(ds1)), rep(FALSE,nrow(ds2)))
    remove(ds1, ds2)
    # 3. Highest mean in both source and target
    topFeaturesAvg = colnames(ds)[order(apply(ds, 2, mean), decreasing = T)]
    end_time <- Sys.time()
    Training_Time_Castle[i] <- as.numeric(difftime(end_time,start_time,units = 'secs'))
    start_time <- Sys.time()
    # for each cell - what is the most probable classification?
    # One one-vs-rest xgboost classifier is trained per source cell type;
    # rows = cell types, columns = target cells.
    L = length(levels(sourceCellTypes))
    targetClassification = as.data.frame(matrix(rep(0,L*sum(!isSource)), nrow=L), row.names = levels(sourceCellTypes))
    for (cellType in levels(sourceCellTypes)) {
      inSourceCellType = as.factor(ifelse(sourceCellTypes == cellType, cellType, paste0("NOT",cellType)))
      # 4. Highest mutual information in source
      topFeaturesMi = names(sort(apply(ds[isSource,],2,function(x) { compare(cut(x,breaks=BREAKS),inSourceCellType,method = "nmi") }), decreasing = T))
      # 5. Top n genes that appear in both mi and avg
      selectedFeatures = union(head(topFeaturesAvg, nFeatures) , head(topFeaturesMi, nFeatures) )
      # 6. remove correlated features (drop a feature when it correlates
      # >= 0.9 with an earlier-selected one)
      tmp = cor(ds[,selectedFeatures], method = "pearson")
      tmp[!lower.tri(tmp)] = 0
      selectedFeatures = selectedFeatures[apply(tmp,2,function(x) any(x < 0.9))]
      remove(tmp)
      # 7,8. Convert data from continous to binned dummy vars
      # break datasets to bins
      dsBins = apply(ds[, selectedFeatures], 2, cut, breaks= BREAKS)
      # use only bins with more than one value
      nUniq = apply(dsBins, 2, function(x) { length(unique(x)) })
      # convert to dummy vars
      ds0 = model.matrix(~ . , as.data.frame(dsBins[,nUniq>1]))
      remove(dsBins, nUniq)
      cat(paste0("<h2>Classifier for ",cellType,"</h2>"))
      inTypeSource = sourceCellTypes == cellType
      # 9. Classify
      xg=xgboost(data=ds0[isSource,] ,
      label=inTypeSource,
      objective="binary:logistic",
      eta=0.7 , nthread=1, nround=20, verbose=0,
      gamma=0.001, max_depth=5, min_child_weight=10)
      # 10. Predict
      inTypeProb = predict(xg, ds0[!isSource, ])
      targetClassification[cellType,] = inTypeProb
    }
    end_time <- Sys.time()
    Testing_Time_Castle[i] <- as.numeric(difftime(end_time,start_time,units = 'secs'))
    True_Labels_Castle[i] <- list(Labels[Test_Idx[[i]]])
    # Predicted label per target cell = cell type with highest probability.
    Pred_Labels_Castle[i] <- list(rownames(targetClassification)[apply(targetClassification,2,which.max)])
  }
  True_Labels_Castle <- as.vector(unlist(True_Labels_Castle))
  Pred_Labels_Castle <- as.vector(unlist(Pred_Labels_Castle))
  Training_Time_Castle <- as.vector(unlist(Training_Time_Castle))
  Testing_Time_Castle <- as.vector(unlist(Testing_Time_Castle))
  # NOTE(review): outputs go to the current working directory, not OutputDir —
  # confirm whether that is intentional.
  if(!is.null(GeneOrderPath) & !is.null (NumGenes)){
    write.csv(True_Labels_Castle,paste('True_Labels_Castle_',NumGenes,'.csv', sep = ''),row.names = FALSE)
    write.csv(Pred_Labels_Castle,paste('Pred_Labels_Castle_',NumGenes,'.csv', sep = ''),row.names = FALSE)
    write.csv(Training_Time_Castle,paste('Training_Time_Castle_',NumGenes,'.csv', sep = ''),row.names = FALSE)
    write.csv(Testing_Time_Castle,paste('Testing_Time_Castle_',NumGenes,'.csv', sep = ''),row.names = FALSE)
  }
  else{
    write.csv(True_Labels_Castle,'True_Labels_CaSTLe.csv',row.names = FALSE)
    write.csv(Pred_Labels_Castle,'Pred_Labels_CaSTLe.csv',row.names = FALSE)
    write.csv(Training_Time_Castle,'Training_Time_CaSTLe.csv',row.names = FALSE)
    write.csv(Testing_Time_Castle,'Testing_Time_CaSTLe.csv',row.names = FALSE)
  }
}
================================================
FILE: Snakemake/Scripts/run_Cell_BLAST.py
================================================
import os
from sys import argv
from pathlib import Path
import time as tm
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
import tensorflow as tf
tf.logging.set_verbosity(0)
import Cell_BLAST as cb
import numpy as np
from numpy import genfromtxt as gft
import rpy2.robjects as robjects
def run_Cell_BLAST(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
    '''
    run Cell_BLAST
    Wrapper script to run Cell_BLAST on a benchmark dataset with 5-fold cross validation,
    outputs lists of true and predicted cell labels as csv files, as well as computation time.
    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
    as row names and gene names as column names.
    LabelsPath : Cell population annotations file path (.csv).
    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
    OutputDir : Output directory defining the path of the exported file.
    GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
    defining the genes order for each cross validation fold, default is NULL.
    NumGenes : Number of genes used in case of feature selection (integer), default is 0.
    '''
    # read the Rdata file (provides n_folds, Cells_to_Keep, col_Index,
    # Test_Idx, Train_Idx as R objects accessed through rpy2)
    robjects.r['load'](CV_RDataPath)
    nfolds = np.array(robjects.r['n_folds'], dtype = 'int')
    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
    col = np.array(robjects.r['col_Index'], dtype = 'int')
    # R indices are 1-based; shift to 0-based
    col = col - 1
    test_ind = np.array(robjects.r['Test_Idx'])
    train_ind = np.array(robjects.r['Train_Idx'])
    # read the feature file (per-fold gene ranking) only when requested
    if (NumGenes > 0):
        features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')
    # read the data and labels; drop the cells filtered out by cross-validation.
    # Fix: removed a dead `labels = pd.read_csv(...)` read here — its result
    # was immediately overwritten by the genfromtxt read below.
    data_old = cb.data.ExprDataSet.read_table(DataPath,orientation="cg", sep=",", index_col = 0, header = 0, sparsify = True).normalize()
    data = cb.data.ExprDataSet(data_old.exprs[tokeep],data_old.obs.iloc[tokeep],data_old.var,data_old.uns)
    labels = gft(LabelsPath, dtype = "str", skip_header = 1, delimiter = ",", usecols = col)
    labels = labels[tokeep]
    truelab = []
    pred = []
    tr_time = []
    ts_time = []
    for i in range(np.squeeze(nfolds)):
        # per-fold indices are 1-based (from R) — shift to 0-based
        test_ind_i = np.array(test_ind[i], dtype = 'int') - 1
        train_ind_i = np.array(train_ind[i], dtype = 'int') - 1
        train=data[train_ind_i,:]
        test=data[test_ind_i,:]
        y_train = labels[train_ind_i]
        y_test = labels[test_ind_i]
        if (NumGenes > 0):
            # restrict both splits to the top-ranked genes of this fold
            feat_to_use = features.iloc[0:NumGenes,i]
            train = train[:,feat_to_use]
            test = test[:,feat_to_use]
        train.obs['cell_type'] = y_train
        start = tm.time()
        # reduce dimensions: fit an ensemble of 4 DIRECTi models with
        # different random seeds (each saved under its own path)
        num_epoch = 50
        models = []
        for j in range(4):
            models.append(cb.directi.fit_DIRECTi(train, epoch=num_epoch, patience=10, random_seed = j, path="%d" % j))
        # train model: build the BLAST index over the reference ensemble
        blast = cb.blast.BLAST(models, train).build_empirical()
        tr_time.append(tm.time()-start)
        # predict labels for the held-out cells
        start = tm.time()
        test_pred = blast.query(test).annotate('cell_type')
        ts_time.append(tm.time()-start)
        truelab.extend(y_test)
        pred.extend(test_pred.values)
    # write results (one row per test cell across all folds)
    truelab = pd.DataFrame(truelab)
    pred = pd.DataFrame(pred)
    tr_time = pd.DataFrame(tr_time)
    ts_time = pd.DataFrame(ts_time)
    truelab.to_csv(str(Path(OutputDir+"/Cell_BLAST_true.csv")),index = False)
    pred.to_csv(str(Path(OutputDir+"/Cell_BLAST_pred.csv")),index = False)
    tr_time.to_csv(str(Path(OutputDir+"/Cell_BLAST_training_time.csv")), index = False)
    ts_time.to_csv(str(Path(OutputDir+"/Cell_BLAST_test_time.csv")),index = False)


run_Cell_BLAST(argv[1], argv[2], argv[3], argv[4], argv[5], int(argv[6]))
================================================
FILE: Snakemake/Scripts/run_DigitalCellSorter.py
================================================
import numpy as np
import pandas as pd
import scripts.DigitalCellSorter as DigitalCellSorter
import os
import time as tm
import rpy2.robjects as robjects
def run_DigitalCellSorter(DataPath, LabelsPath, CV_RDataPath, GeneListPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
    '''
    run DigitalCellSorter
    Wrapper script to run DigitalCellSorter on a benchmark dataset using a predefined genelist,
    outputs lists of true and predicted cell labels as csv files, as well as computation time.
    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
    as row names and gene names as column names.
    LabelsPath : Cell population annotations file path (.csv).
    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
    GeneListPath : Data file path to the genest.
    OutputDir : Output directory defining the path of the exported file.
    GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
    defining the genes order for each cross validation fold, default is NULL.
    NumGenes : Number of genes used in case of feature selection (integer), default is 0.
    '''
    # read the Rdata file (provides Cells_to_Keep and col_Index; the fold
    # indices are not needed because this tool is not cross-validated)
    robjects.r['load'](CV_RDataPath)
    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
    col = np.array(robjects.r['col_Index'], dtype = 'int')
    # R indices are 1-based; shift to 0-based
    col = col - 1
    # read the data and the true labels, keeping only the filtered cells
    data = pd.read_csv(DataPath,index_col=0,sep=',')
    data = data.iloc[tokeep]
    truelab = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col)
    truelab = truelab.iloc[tokeep]
    # read the feature file; only the first fold's ranking (column 0) is used
    if (NumGenes > 0):
        features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')
        feat_to_use = features.iloc[0:NumGenes,0]
        data = data.iloc[:,feat_to_use]
    # DigitalCellSorter expects genes as rows, cells as columns
    data = data.transpose()
    # number of different cell types in the data?
    # NOTE(review): n_clusters is hard-coded to 8 — presumably tuned to the
    # Zhang PBMC dataset; confirm before reusing on other data.
    n_clusters = 8
    AvailableCPUsCount = 1
    N_samples_for_distribution = 10000
    start = tm.time()
    # Process() clusters the cells and votes on a type per cluster; the
    # return value is the per-cell cluster index.
    pred = DigitalCellSorter.DigitalCellSorter().Process(data, 'DigitalCellSorter_Zhang',
    saveDir = OutputDir,
    geneListFileName = GeneListPath,
    N_samples_for_distribution = N_samples_for_distribution,
    AvailableCPUsCount = AvailableCPUsCount,
    clusterIndex=None,
    clusterName=None,
    n_clusters=n_clusters)
    runtime = tm.time() - start
    os.chdir(OutputDir)
    # Map cluster indices to cell-type names using the voting spreadsheet
    # written by Process().  NOTE(review): usecols=[11] assumes the predicted
    # type lives in the 12th column of the xlsx — verify against the
    # DigitalCellSorter version in use.
    results = pd.read_excel('DigitalCellSorter_Zhang_voting.xlsx',header=0,index_col=None, usecols=[11])
    prediction = np.zeros(np.shape(pred), dtype='>U10')
    for i in range(len(results)):
        prediction[np.where(pred == i)] = results.values[i]
    prediction = pd.DataFrame(prediction)
    # output file names encode the number of selected genes (if any)
    if (NumGenes == 0):
        truelab.to_csv("DigitalCellSorter_True_Labels.csv", index = False)
        prediction.to_csv("DigitalCellSorter_Pred_Labels.csv", index = False)
        with open("DigitalCellSorter_Total_Time.csv", 'w') as f:
            f.write("%f\n" % runtime)
    else:
        truelab.to_csv("DigitalCellSorter_" + str(NumGenes) + "_True_Labels.csv", index = False)
        prediction.to_csv("DigitalCellSorter_" + str(NumGenes) + "_Pred_Labels.csv", index = False)
        with open("DigitalCellSorter_" + str(NumGenes) + "_Total_Time.csv", 'w') as f:
            f.write("%f\n" % runtime)
================================================
FILE: Snakemake/Scripts/run_Garnett_CV.R
================================================
args <- commandArgs(TRUE)
run_Garnett_CV <- function(DataPath, LabelsPath, CV_RDataPath, GenesPath, MarkerPath, OutputDir, Human){
  "
  run Garnett
  Wrapper script to run Garnett on a benchmark dataset with 5-fold cross validation,
  outputs lists of true and predicted cell labels as csv files, as well as computation time.
  Parameters
  ----------
  DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
  as row names and gene names as column names.
  LabelsPath : Cell population annotations file path (.csv).
  CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
  GenesPath : Path to the file with the genenames
  MarkerPath : Path to the file with marker genes
  OutputDir : Output directory defining the path of the exported file.
  Human : boolean indicating whether the dataset is human (TRUE) or mouse (FALSE)
  "
  # load needed libraries
  library(garnett)
  # The CLI passes Human as a string ("TRUE"/"FALSE"); normalize to logical.
  Human <- as.logical(Human)
  if (Human) {
    library(org.Hs.eg.db)
  } else {
    library(org.Mm.eg.db)
  }
  # load the CVFile (provides n_folds, Cells_to_Keep, col_Index,
  # Train_Idx, Test_Idx)
  load(CV_RDataPath)
  # read the labels
  labels <- as.matrix(read.csv(LabelsPath))
  labels <- as.vector(labels[, col_Index])
  labels <- labels[Cells_to_Keep]
  # read the data (first row = gene names, first column = cell barcodes)
  mat <- read.table(DataPath, sep = ",")
  data <- mat[-1, -1]
  data <- data[Cells_to_Keep, ]
  data <- t(data) # ensure that the genes are rows, and the cells are columns
  cells <- mat[-1, 1]
  cells <- cells[Cells_to_Keep]
  # read the genefile
  fdata <- read.table(GenesPath)
  names(fdata) <- 'gene_short_name'
  row.names(fdata) <- fdata$gene_short_name
  fd <- new("AnnotatedDataFrame", data = fdata)
  # Resolve the annotation database once instead of branching at every call.
  db_obj <- if (Human) org.Hs.eg.db else org.Mm.eg.db
  true_labels <- list()
  pred_labels <- list()
  train_time <- list()
  test_time <- list()
  for (i in seq_len(n_folds)) {
    lab_test <- labels[Test_Idx[[i]]]
    train <- data[, Train_Idx[[i]]]
    test <- data[, Test_Idx[[i]]]
    cells_train <- cells[Train_Idx[[i]]]
    cells_test <- cells[Test_Idx[[i]]]
    pdata_train <- data.frame(cells_train)
    pdata_test <- data.frame(cells_test)
    row.names(train) <- row.names(fdata)
    row.names(test) <- row.names(fdata)
    colnames(train) <- row.names(pdata_train)
    colnames(test) <- row.names(pdata_test)
    pd_train <- new("AnnotatedDataFrame", data = pdata_train)
    pd_test <- new("AnnotatedDataFrame", data = pdata_test)
    pbmc_cds_train <- newCellDataSet(as(train, "dgCMatrix"), phenoData = pd_train, featureData = fd)
    pbmc_cds_test <- newCellDataSet(as(test, "dgCMatrix"), phenoData = pd_test, featureData = fd)
    pbmc_cds_train <- estimateSizeFactors(pbmc_cds_train)
    pbmc_cds_test <- estimateSizeFactors(pbmc_cds_test)
    # training
    start_train <- Sys.time()
    pbmc_classifier <- train_cell_classifier(cds = pbmc_cds_train,
                                             marker_file = MarkerPath,
                                             db = db_obj,
                                             cds_gene_id_type = "SYMBOL",
                                             num_unknown = 50,
                                             marker_file_gene_id_type = "SYMBOL")
    end_train <- Sys.time()
    # Fix: force seconds — `as.numeric(end - start)` reports whatever unit
    # difftime auto-selects (seconds, minutes, ...), which silently mixes
    # units across folds and disagrees with the sibling wrapper scripts.
    train_time[i] <- as.numeric(difftime(end_train, start_train, units = "secs"))
    # testing
    start_test <- Sys.time()
    pbmc_cds_test <- classify_cells(pbmc_cds_test,
                                    pbmc_classifier,
                                    db = db_obj,
                                    cluster_extend = TRUE,
                                    cds_gene_id_type = "SYMBOL")
    end_test <- Sys.time()
    test_time[i] <- as.numeric(difftime(end_test, start_test, units = "secs"))
    true_labels[i] <- list(lab_test)
    pred_labels[i] <- list(pData(pbmc_cds_test)$cluster_ext_type)
  }
  true_labels <- as.vector(unlist(true_labels))
  pred_labels <- as.vector(unlist(pred_labels))
  train_time <- as.vector(unlist(train_time))
  test_time <- as.vector(unlist(test_time))
  write.csv(true_labels, paste0(OutputDir, '/Garnett_CV_true.csv'), row.names = FALSE)
  write.csv(pred_labels, paste0(OutputDir, '/Garnett_CV_pred.csv'), row.names = FALSE)
  write.csv(train_time, paste0(OutputDir, '/Garnett_CV_training_time.csv'), row.names = FALSE)
  write.csv(test_time, paste0(OutputDir, '/Garnett_CV_test_time.csv'), row.names = FALSE)
}
run_Garnett_CV(args[1], args[2], args[3], args[4], args[5], args[6], args[7])
================================================
FILE: Snakemake/Scripts/run_Garnett_Pretrained.R
================================================
args <- commandArgs(TRUE)
run_Garnett_Pretrained <- function(DataPath, LabelsPath, GenesPath, CV_RDataPath, ClassifierPath, OutputDir, Human){
  "
  run Garnett
  Wrapper script to run Garnett on a benchmark dataset with a pretrained classifier,
  outputs lists of true and predicted cell labels as csv files, as well as computation time.
  Parameters
  ----------
  DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
  as row names and gene names as column names.
  LabelsPath : Cell population annotations file path (.csv).
  CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
  GenesPath : Path to the file with the genenames
  ClassifierPath : Path to the pretrained classifier
  OutputDir : Output directory defining the path of the exported file.
  Human : boolean indicating whether the dataset is human (TRUE) or mouse (FALSE)
  "
  # load needed libraries
  library(garnett)
  # The CLI passes Human as a string ("TRUE"/"FALSE"); normalize to logical.
  Human <- as.logical(Human)
  if (Human) {
    library(org.Hs.eg.db)
  } else {
    library(org.Mm.eg.db)
  }
  # load data, genes, and marker file.
  # NOTE(review): ClassifierPath is expected to provide an object named
  # `hsPBMC` (human) or `mmLung` (mouse) — confirm for new classifiers.
  load(CV_RDataPath)
  load(ClassifierPath)
  labels <- as.matrix(read.csv(LabelsPath))
  # NOTE(review): unlike run_Garnett_CV, no column is selected via col_Index
  # before this logical subset — this assumes a single-column labels file.
  labels <- labels[Cells_to_Keep]
  mat <- read.table(DataPath, sep = ",")
  data <- mat[-1, -1]
  data <- data[Cells_to_Keep, ]
  data <- t(data) # ensure that the genes are rows, and the cells are columns
  barcodes <- mat[-1, 1]
  pdata <- data.frame(barcodes)
  fdata <- read.table(GenesPath)
  names(fdata) <- 'gene_short_name'
  row.names(fdata) <- fdata$gene_short_name
  row.names(data) <- row.names(fdata)
  colnames(data) <- row.names(pdata)
  pd <- new("AnnotatedDataFrame", data = pdata)
  fd <- new("AnnotatedDataFrame", data = fdata)
  pbmc_cds <- newCellDataSet(as(data, "dgCMatrix"),
                             phenoData = pd,
                             featureData = fd)
  start_time <- Sys.time()
  pbmc_cds <- estimateSizeFactors(pbmc_cds)
  if (Human) {
    pbmc_cds <- classify_cells(pbmc_cds, hsPBMC, db = org.Hs.eg.db, cluster_extend = TRUE, cds_gene_id_type = "SYMBOL")
  } else {
    pbmc_cds <- classify_cells(pbmc_cds, mmLung, db = org.Mm.eg.db, cluster_extend = TRUE, cds_gene_id_type = "SYMBOL")
  }
  end_time <- Sys.time()
  # Fix: force seconds — `as.numeric(end - start)` reports whatever unit
  # difftime auto-selects (seconds, minutes, ...), making the exported
  # timing ambiguous and inconsistent with the sibling wrapper scripts.
  test_time <- as.numeric(difftime(end_time, start_time, units = "secs"))
  write.table(pData(pbmc_cds)$cluster_ext_type,
              file = paste0(OutputDir, "/Garnett_Pretrained_pred.csv"), append = FALSE, quote = TRUE, sep = "\t",
              eol = "\n", na = "NA", dec = ".", row.names = FALSE,
              qmethod = c("escape", "double"),
              fileEncoding = "")
  write.csv(labels, paste0(OutputDir, "/Garnett_Pretrained_true.csv"), row.names = FALSE)
  write.csv(test_time, paste0(OutputDir, '/Garnett_Pretrained_test_time.csv'), row.names = FALSE)
}
run_Garnett_Pretrained(args[1], args[2], args[3], args[4], args[5], args[6], args[7])
================================================
FILE: Snakemake/Scripts/run_LAmbDA.py
================================================
# -*- coding: utf-8 -*-
"""
Created on Thu May 23 13:51:15 2019
@author: Lieke
"""
import os
import numpy as np
import pandas as pd
import time as tm
import rpy2.robjects as robjects
import tensorflow as tf
import math
import scipy.io as sio
import optunity as opt
from tensorflow.contrib.tensor_forest.python import tensor_forest
from tensorflow.python.ops import resources
def run_LAmbDA(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
    '''
    run LAmbDA classifier
    Wrapper script to run LAmbDA on a benchmark dataset with 5-fold cross validation,
    outputs lists of true and predicted cell labels as csv files, as well as computation time.
    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
    as row names and gene names as column names.
    LabelsPath : Cell population annotations file path (.csv).
    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
    OutputDir : Output directory defining the path of the exported file.
    GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
    defining the genes order for each cross validation fold, default is NULL.
    NumGenes : Number of genes used in case of feature selection (integer), default is 0.
    '''
    # read the Rdata file: fold definitions exported by Cross_Validation.R.
    # All indices stored in it are 1-based (R convention).
    robjects.r['load'](CV_RDataPath)
    nfolds = np.array(robjects.r['n_folds'], dtype = 'int')
    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
    col = np.array(robjects.r['col_Index'], dtype = 'int')
    col = col - 1  # label column index: 1-based -> 0-based
    test_ind = np.array(robjects.r['Test_Idx'])
    train_ind = np.array(robjects.r['Train_Idx'])
    # read the data (cells x genes) and the selected annotation column,
    # keeping only the cells retained by cross validation
    data = pd.read_csv(DataPath,index_col=0,sep=',')
    labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col)
    labels = labels.iloc[tokeep]
    data = data.iloc[tokeep]
    # read the feature file (per-fold ranked gene lists), only needed when
    # feature selection is requested
    if (NumGenes > 0):
        features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')
    # folder with results; all outputs (csv files and the intermediate
    # .mat files written by run_LAmbDA2) go here
    os.chdir(OutputDir)
    tr_time=[]
    ts_time=[]
    truelab = np.zeros([len(labels),1],dtype = int)
    predlab = np.zeros([len(labels),1],dtype = int)
    for i in range(np.squeeze(nfolds)):
        # run_LAmbDA2 (the optunity objective) communicates through these
        # module-level globals rather than arguments
        global X, Y, Gnp, Dnp, train, test, prt, cv
        test_ind_i = np.array(test_ind[i], dtype = 'int') - 1
        train_ind_i = np.array(train_ind[i], dtype = 'int') - 1
        X = np.array(data)
        if (NumGenes > 0):
            # log-transform, then keep this fold's top NumGenes genes
            X = np.log2(X/10+1)
            feat_to_use = features.iloc[0:NumGenes,i]
            X = X[:,feat_to_use]
        else:
            # unsupervised filter: drop genes that are zero in >= 50% of
            # cells or whose variance is at/below the 80th percentile,
            # then log-transform
            X = np.log2(np.transpose(select_feats(np.transpose(X),0.5,80))/10+1)
        # build a one-hot label matrix Y (cells x classes)
        uniq = np.unique(labels)
        Y = np.zeros([len(labels),len(uniq)],int)
        for j in range(len(uniq)):
            Y[np.where(labels == uniq[j])[0],j] = 1
        Y = np.array(Y)
        # Gnp (label map) is the identity and Dnp (dataset map) a single
        # column of ones, so LAmbDA's cross-dataset machinery reduces to
        # ordinary single-dataset classification here
        Gnp = np.zeros([len(uniq),len(uniq)],int)
        np.fill_diagonal(Gnp,1)
        Gnp = np.array(Gnp)
        Dnp = np.ones([len(uniq),1],int)
        Dnp = np.array(Dnp)
        # inner 75/25 split of the training cells used only for
        # hyperparameter tuning
        train_samp = int(np.floor(0.75*len(train_ind_i)))
        test_samp = len(train_ind_i) - train_samp
        perm = np.random.permutation(len(train_ind_i))
        train = perm[0:train_samp]
        # NOTE(review): this slice looks wrong — perm[train_samp:test_samp+1]
        # was presumably meant to be perm[train_samp:train_samp+test_samp]
        # (compare the re-draw inside the while loop below); confirm intent
        test = perm[train_samp:test_samp+1]
        # re-draw until every class has at least 5 training cells.
        # NOTE(review): the re-draw permutes X.shape[0] (all cells), not
        # len(train_ind_i) as above — verify this is intended
        while(np.sum(np.sum(Y[train,:],0)<5)>0):
            perm = np.random.permutation(X.shape[0])
            train = perm[0:train_samp+1]
            test = perm[train_samp+1:train_samp+test_samp+1]
        cv = i
        optunity_it = 0
        prt = False  # objective runs without dumping .mat files while tuning
        opt_params = None
        start=tm.time()
        # hyperparameter search (sobol solver, 50 evaluations); the objective
        # returns a loss, hence opt.minimize
        opt_params, _, _ = opt.minimize(run_LAmbDA2,solver_name='sobol', gamma=[0.8,1.2], delta=[0.05,0.95], tau=[10.0,11.0], prc_cut=[20,50], bs_prc=[0.2,0.6], num_trees=[10,200], max_nodes=[100,1000], num_evals=50)
        tr_time.append(tm.time()-start)
        print("Finished training!")
        # final run on the real train/test split of this fold; prt=True makes
        # run_LAmbDA2 write preds_cv<i>.mat and truth_cv<i>.mat
        prt = True
        train = train_ind_i
        test = test_ind_i
        start=tm.time()
        err = run_LAmbDA2(opt_params['gamma'], opt_params['delta'], opt_params['tau'], opt_params['prc_cut'], opt_params['bs_prc'], opt_params['num_trees'], opt_params['max_nodes'])
        ts_time.append(tm.time()-start)
        tf.reset_default_graph();
        # collect this fold's predictions/truth back from the .mat files
        predfile = 'preds_cv' + str(cv) + '.mat'
        truefile = 'truth_cv' + str(cv) + '.mat'
        pred = sio.loadmat(predfile)
        truth = sio.loadmat(truefile)
        pred = pred['preds']
        truth = truth['labels']
        pred_ind = np.argmax(pred,axis=1)
        truth_ind = np.argmax(truth,axis=1)
        predlab[test_ind_i,0] = pred_ind
        truelab[test_ind_i,0] = truth_ind
    # export per-cell labels (as class indices) and per-fold timings
    truelab = pd.DataFrame(truelab)
    predlab = pd.DataFrame(predlab)
    tr_time = pd.DataFrame(tr_time)
    ts_time = pd.DataFrame(ts_time)
    if (NumGenes == 0):
        truelab.to_csv("LAmbDA_True_Labels.csv", index = False)
        predlab.to_csv("LAmbDA_Pred_Labels.csv", index = False)
        tr_time.to_csv("LAmbDA_Training_Time.csv", index = False)
        ts_time.to_csv("LAmbDA_Testing_Time.csv", index = False)
    else:
        truelab.to_csv("LAmbDA_" + str(NumGenes) + "_True_Labels.csv", index = False)
        predlab.to_csv("LAmbDA_" + str(NumGenes) + "_Pred_Labels.csv", index = False)
        tr_time.to_csv("LAmbDA_" + str(NumGenes) + "_Training_Time.csv", index = False)
        ts_time.to_csv("LAmbDA_" + str(NumGenes) + "_Testing_Time.csv", index = False)
##### Functions copied from LAmbDA's Github
def wt_cutoff(colnum, cutoff, Gtmp, gamma):
    """Per-class resampling target: `cutoff` scaled by an ambiguity weight.

    The weight is log2(max_rowsum / rowsum[colnum] + 1) ** gamma computed
    from the row sums of the label-mapping matrix Gtmp, rounded up.
    """
    row_totals = np.sum(Gtmp, axis=1)
    ratio = (max(row_totals) / row_totals[colnum]) + 1
    weight = math.log(ratio, 2) ** gamma
    return math.ceil(cutoff * weight)
def resample(prc_cut, Y, Gtmp, train, gamma):
    """Balance training classes around a percentile-based cutoff.

    Classes below their weighted cutoff (see wt_cutoff) are oversampled
    with replacement; classes above it are undersampled without
    replacement.  Returns the rebalanced array of training indices.
    """
    oversampled = []
    dropped = []
    class_counts = np.sum(Y[train, :], axis=0)
    base_cut = math.ceil(np.percentile(class_counts, prc_cut))
    for cls in range(len(class_counts)):
        n = class_counts[cls]
        if n == 0:
            continue  # absent class: nothing to balance
        target = wt_cutoff(cls, base_cut, Gtmp, gamma)
        if n == target:
            continue  # already at the target size
        members = np.squeeze(np.array(np.where(Y[train, cls] >= 1)))
        if n < target:
            # too few cells: draw extras with replacement
            picks = np.random.choice(train[members], int(target - n))
            oversampled.extend(picks.tolist())
        else:
            # too many cells: mark the surplus for removal
            picks = np.random.choice(train[members], int(n - target), replace=False)
            dropped.extend(picks.tolist())
    kept = [idx for idx in train if idx not in dropped]
    return np.concatenate((kept, oversampled))
def select_feats(Xtmp, num_zero_prc_cut, var_prc_cut):
    """Filter the rows (features) of a features-x-cells matrix.

    First drops features that are zero in at least num_zero_prc_cut of the
    cells, then drops features whose variance is at or below the
    var_prc_cut percentile of the remaining variances.
    """
    n_cells = Xtmp.shape[1]
    # remove features with many zeros
    zero_counts = np.sum(Xtmp == 0, axis=1)
    Xtmp = Xtmp[zero_counts < num_zero_prc_cut * n_cells, :]
    # remove features with low variance
    variances = np.var(Xtmp, axis=1)
    keep = variances > np.percentile(variances, var_prc_cut)
    return Xtmp[keep, :]
def get_yn(predict,ys,delta,tau,output_feats):
    # Build adapted one-hot training targets from the current model
    # predictions (LAmbDA's label-adjustment step).  Reads the module
    # globals Dnp (dataset map) and Gnp (label map) that run_LAmbDA sets.
    D = tf.cast(Dnp, tf.float32);
    G = tf.cast(Gnp, tf.float32);
    ys = tf.cast(ys, tf.float32);
    #print("start")
    # Cm: mean prediction per dataset-label group (rows grouped via ys@D);
    # the +0.1 keeps later ratios finite when predictions are zero
    Cm = tf.matmul(tf.transpose(tf.matmul(ys,D)),predict+0.1)/tf.reshape(tf.reduce_sum(tf.transpose(tf.matmul(ys,D)),1),(-1,1));
    #print("1")
    # mCm: row means of Cm masked to entries where D^T @ G > 0
    mCm = tf.reshape(tf.reduce_mean(tf.cast(tf.matmul(tf.transpose(D),G)>0,tf.float32)*Cm,1),(-1,1));
    #print("2")
    # yw: per-cell predictions re-weighted by (mCm/Cm)^tau routed through ys@D
    yw = tf.multiply(predict+0.1,tf.matmul(tf.matmul(ys,D),tf.pow(mCm/Cm,tau)));
    #print("3")
    # ye: restrict the weights to output labels reachable from each cell's
    # input label (mask ys@G)
    ye = tf.multiply(tf.matmul(ys,G),yw);
    #print("4")
    # yt: aggregate ye over all cells sharing the same input label
    # (ys @ (ys^T @ ye))
    yt = tf.matmul(ys,tf.matmul(tf.transpose(ys),ye));
    #print("5")
    # blend the per-label aggregate (yt) with the per-cell evidence (ye)
    ya = (delta*yt)+((1-delta)*ye)
    #print("6")
    # harden to a one-hot vector on the winning output label
    yn = tf.cast(tf.one_hot(tf.argmax(ya,axis=1),output_feats), dtype=tf.float32)
    #print("7")
    return(yn)
def get_yi(rowsums, G2, ys):
    """Map one-hot labels through G2 into the output label space.

    NOTE: the rowsums argument is accepted but never used.
    """
    labels_f = tf.cast(ys, tf.float32)
    mapping_f = tf.cast(G2, tf.float32)
    return tf.cast(tf.matmul(labels_f, mapping_f), dtype=tf.float32)
def run_LAmbDA2(gamma, delta, tau, prc_cut, bs_prc, num_trees, max_nodes):
    # Objective function handed to optunity by run_LAmbDA: builds a
    # tensor_forest random forest with the given hyperparameters, trains it
    # for 100 iterations, and returns the held-out loss (lower is better).
    # All data flows in through module globals set by run_LAmbDA; when prt
    # is True, the final predictions/truth are dumped to .mat files for the
    # current fold (cv).
    global X, Y, Gnp, Dnp, train, test, prt, cv
    D = tf.cast(Dnp, tf.float32);
    G = tf.cast(Gnp, tf.float32);
    #optunity_it = optunity_it+1;
    # optunity supplies floats; the forest needs integer sizes
    num_trees = int(num_trees);
    max_nodes = int(max_nodes);
    prc_cut = int(np.ceil(prc_cut));
    print("gamma=%.4f, delta=%.4f, tau=%.4f, prc_cut=%i, bs_prc=%.4f, num_trees=%i, max_nodes=%i" % (gamma, delta, tau, prc_cut, bs_prc, num_trees, max_nodes))
    input_feats = X.shape[1];
    num_labls = G.shape.as_list();
    output_feats = num_labls[1];  # columns of G = output label space
    #print(output_feats)
    num_labls = num_labls[0];     # rows of G = input label space
    rowsums = np.sum(Gnp,axis=1);
    # class-balance the training indices (over-/under-sampling)
    train2 = resample(prc_cut, Y, Gnp, train, gamma); # Bug??
    bs = int(np.ceil(bs_prc*train2.size))  # minibatch size
    xs = tf.placeholder(tf.float32, [None,input_feats])
    #ys = tf.placeholder(tf.float32, [None,num_labls])
    yin = tf.placeholder(tf.int32, [None])  # integer class targets
    print("Vars loaded xs and ys created")
    hparams = tensor_forest.ForestHParams(num_classes=output_feats,
                                          num_features=input_feats,
                                          num_trees=num_trees,
                                          max_nodes=max_nodes).fill()
    print("Tensor forest hparams created")
    forest_graph = tensor_forest.RandomForestGraphs(hparams)
    print("Tensor forest graph created")
    train_op = forest_graph.training_graph(xs, yin)
    loss_op = forest_graph.training_loss(xs, yin)
    print("Loss and train ops created")
    predict, _, _ = forest_graph.inference_graph(xs)
    print("Tensor forest variables created through predict")
    # despite the name, this is a sum-of-squares error between the one-hot
    # targets and the predicted class probabilities (lower is better),
    # which is why run_LAmbDA minimizes this function's return value
    accuracy_op = tf.reduce_mean(tf.reduce_sum(tf.square(tf.one_hot(yin,output_feats)-predict),reduction_indices=[1]))
    print(tf.reduce_sum(tf.square(tf.one_hot(yin,output_feats)-predict),reduction_indices=[1]))
    #predict = tf.one_hot(pred);
    print("Lambda specific variables created")
    # Creating training and testing steps
    # G2 zeroes out ambiguous input labels (rows of G mapping to more than
    # one output label); cells whose label survives (YIrs == 1) form the
    # warm-up train/test sets
    G2 = np.copy(Gnp);
    G2[rowsums>1,:] = 0;
    YI = np.matmul(Y,G2);
    YIrs = np.sum(YI,axis=1);
    trainI = train2[np.in1d(train2,np.where(YIrs==1))];
    print("data type trainI,",trainI.dtype)
    testI = test[np.in1d(test,np.where(YIrs==1))];
    print("trainI testI created")
    #init_vars=tf.global_variables_initializer()
    init_vars = tf.group(tf.global_variables_initializer(),
                         resources.initialize_resources(resources.shared_resources()))
    sess = tf.Session()
    sess.run(init_vars)
    print("Session started")
    #beep = sess.run(predict,feed_dict={xs:X[1:100,:]});
    #beep = sess.run(predict,feed_dict={xs:X[train2[0:bs],:]});
    # warm-up feeds: direct (unambiguous) labels via get_yi
    tensor_trainI = {xs: X[trainI, :], yin: sess.run(tf.argmax(get_yi(rowsums,G2,Y[trainI, :]),axis=1))}
    print("tensor_trainI made")
    tensor_testI = {xs: X[testI, :], yin: sess.run(tf.argmax(get_yi(rowsums,G2,Y[testI, :]),axis=1))}
    print("tensor_testI made")
    # main feeds: adapted labels from get_yn based on the (untrained)
    # forest's current predictions
    tensor_train = {xs: X[train2[0:bs], :], yin: sess.run(tf.argmax(get_yn(sess.run(predict,feed_dict={xs:X[train2[0:bs],:]}),Y[train2[0:bs], :],delta,tau,output_feats),axis=1))}
    print("tensor_train made")
    tensor_test = {xs: X[test, :], yin: sess.run(tf.argmax(get_yn(sess.run(predict,feed_dict={xs:X[test,:]}),Y[test, :],delta,tau,output_feats),axis=1))}
    print("tensor_test made")
    #**********************************
    #print("Loss and training steps created with sample tensors")
    # Setting params and initializing
    print("Beginning iterations")
    # Starting training iterations: 49 warm-up steps on unambiguous labels,
    # then 51 steps on the adapted labels
    print(X.shape)
    for i in range(1,101):
        if i < 50:
            sess.run(train_op, feed_dict=tensor_trainI)
            #print("ran train op")
            if i % 10 == 0:
                print(str(sess.run(accuracy_op, feed_dict=tensor_trainI)) + ' ' + str(sess.run(accuracy_op, feed_dict=tensor_testI)))
        else:
            sess.run(train_op, feed_dict=tensor_train)
            if i % 10 == 0:
                print(str(sess.run(accuracy_op, feed_dict=tensor_train)) + ' ' + str(sess.run(accuracy_op, feed_dict=tensor_test)))
            elif i % 10 == 0:
                # NOTE(review): dead code — this condition duplicates the
                # branch above, so it can never execute.  If it ever did,
                # np.random_shuffle would raise AttributeError (numpy's API
                # is np.random.shuffle), and the rebuilt feed is missing the
                # tf.argmax applied everywhere else.  Confirm intent before
                # fixing.
                np.random_shuffle(train2);
                tensor_train = {xs: X[train2[0:bs], :], yin: sess.run(get_yn(sess.run(predict,feed_dict={xs:X[train2[0:bs],:]}),Y[train2[0:bs], :],delta,tau,output_feats))}
    if prt:
        # final evaluation pass: dump this fold's predictions and one-hot
        # truth so run_LAmbDA can collect them from disk
        blah = sess.run(predict, feed_dict=tensor_test);
        sio.savemat('preds_cv' + str(cv) + '.mat', {'preds': blah});
        sio.savemat('truth_cv' + str(cv) + '.mat', {'labels': Y[test, :]});
    acc = sess.run(accuracy_op, feed_dict=tensor_test)
    print("loss1=%.4f, gamma=%.4f, delta=%.4f, tau=%.4f, prc_cut=%i, bs_prc=%.4f, num_trees=%i, max_nodes=%i" % (acc, gamma, delta, tau, prc_cut, bs_prc, num_trees, max_nodes))
    tf.reset_default_graph();
    return(acc)
================================================
FILE: Snakemake/Scripts/run_LDA.py
================================================
import os
from sys import argv
from pathlib import Path
import numpy as np
import pandas as pd
import time as tm
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import rpy2.robjects as robjects
def run_LDA(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
    '''
    run baseline classifier: LDA
    Wrapper script to run LDA classifier on a benchmark dataset with 5-fold cross validation,
    outputs lists of true and predicted cell labels as csv files, as well as computation time.
    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
    as row names and gene names as column names.
    LabelsPath : Cell population annotations file path (.csv).
    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
    OutputDir : Output directory defining the path of the exported file.
    GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
    defining the genes order for each cross validation fold, default is NULL.
    NumGenes : Number of genes used in case of feature selection (integer), default is 0.
    '''
    # fold definitions produced by Cross_Validation.R (1-based indices)
    robjects.r['load'](CV_RDataPath)
    n_folds = np.array(robjects.r['n_folds'], dtype = 'int')
    cells_to_keep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
    label_col = np.array(robjects.r['col_Index'], dtype = 'int')
    label_col = label_col - 1  # R column index -> 0-based
    fold_test_idx = np.array(robjects.r['Test_Idx'])
    fold_train_idx = np.array(robjects.r['Train_Idx'])
    # expression matrix (cells x genes) and the annotation column,
    # restricted to the retained cells
    expr = pd.read_csv(DataPath, index_col = 0, sep = ',')
    annot = pd.read_csv(LabelsPath, header = 0, index_col = None, sep = ',', usecols = label_col)
    annot = annot.iloc[cells_to_keep]
    expr = expr.iloc[cells_to_keep]
    # per-fold ranked gene lists, only needed with feature selection
    if (NumGenes > 0):
        gene_order = pd.read_csv(GeneOrderPath, header = 0, index_col = None, sep = ',')
    # log(1 + x) normalization
    expr = np.log1p(expr)
    clf = LinearDiscriminantAnalysis()
    train_times = []
    test_times = []
    all_true = []
    all_pred = []
    for fold in range(np.squeeze(n_folds)):
        # fold indices are 1-based (exported from R)
        te_idx = np.array(fold_test_idx[fold], dtype = 'int') - 1
        tr_idx = np.array(fold_train_idx[fold], dtype = 'int') - 1
        x_train = expr.iloc[tr_idx]
        x_test = expr.iloc[te_idx]
        y_train = annot.iloc[tr_idx]
        y_test = annot.iloc[te_idx]
        if (NumGenes > 0):
            top_genes = gene_order.iloc[0:NumGenes, fold]
            x_train = x_train.iloc[:, top_genes]
            x_test = x_test.iloc[:, top_genes]
        t0 = tm.time()
        clf.fit(x_train, y_train)
        train_times.append(tm.time() - t0)
        t0 = tm.time()
        fold_pred = clf.predict(x_test)
        test_times.append(tm.time() - t0)
        all_true.extend(y_test.values)
        all_pred.extend(fold_pred)
    # export true/predicted labels and per-fold timings
    out_dir = Path(OutputDir)
    pd.DataFrame(all_true).to_csv(str(out_dir / Path("LDA_true.csv")),
                                  index = False)
    pd.DataFrame(all_pred).to_csv(str(out_dir / Path("LDA_pred.csv")),
                                  index = False)
    pd.DataFrame(train_times).to_csv(str(out_dir / Path("LDA_training_time.csv")),
                                     index = False)
    pd.DataFrame(test_times).to_csv(str(out_dir / Path("LDA_test_time.csv")),
                                    index = False)
run_LDA(argv[1], argv[2], argv[3], argv[4], argv[5], int(argv[6]))
================================================
FILE: Snakemake/Scripts/run_LDA_rejection.py
================================================
import os
from sys import argv
from pathlib import Path
import numpy as np
import pandas as pd
import time as tm
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import rpy2.robjects as robjects
def run_LDA_rejection(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0, Threshold = 0.7):
'''
run baseline classifier: LDA
Wrapper script to run LDA classifier on a benchmark dataset with 5-fold cross validation,
outputs lists of true and predicted cell labels as csv files, as well as computation time.
Parameters
----------
DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
as row names and gene names as column names.
LabelsPath : Cell population annotations file path (.csv).
CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
OutputDir : Output directory defining the path of the exported file.
GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
defining the genes order for each cross validation fold, default is NULL.
NumGenes : Number of genes used in case of feature selection (integer), default is 0.
Threshold : Threshold used when rejecting the genes, default is 0.7.
'''
# read the Rdata file
robjects.r['load'](CV_RDataPath)
nfolds = np.array(robjects.r['n_folds'], dtype = 'int')
tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
col = np.array(robjects.r['col_Index'], dtype = 'int')
col = col - 1
test_ind = np.array(robjects.r['Test_Idx'])
train_ind = np.array(robjects.r['Train_Idx'])
# read the data
data = pd.read_csv(DataPath,index_col=0,sep=',')
labels = pd.read_csv(LabelsPath, header=0,index_col=None, sep=',', usecols = col)
labels = labels.iloc[tokeep]
data = data.iloc[tokeep]
# read the feature file
if (NumGenes > 0):
features = pd.read_csv(GeneOrderPath,h
gitextract_ikyozzhh/ ├── Cross_Validation.R ├── DEgenesMAST.R ├── LICENSE ├── README.md ├── Scripts/ │ ├── run_ACTINN.py │ ├── run_CHETAH.R │ ├── run_CaSTLe.R │ ├── run_Cell_BLAST.py │ ├── run_DigitalCellSorter.py │ ├── run_Garnett_CV.R │ ├── run_Garnett_Pretrained.R │ ├── run_LAmbDA.py │ ├── run_LDA.py │ ├── run_LDA_rejection.py │ ├── run_NMC.py │ ├── run_RF.py │ ├── run_SCINA.R │ ├── run_SVM.py │ ├── run_SVM_rejection.py │ ├── run_SingleR.R │ ├── run_kNN50.py │ ├── run_kNN9.py │ ├── run_moana.py │ ├── run_scID.R │ ├── run_scPred.R │ ├── run_scVI.py │ ├── run_scmap.R │ └── run_singleCellNet.R ├── Snakemake/ │ ├── Cross_Validation.R │ ├── DEgenesMAST.R │ ├── Dockerfiles/ │ │ ├── baseline/ │ │ │ └── Dockerfile │ │ ├── cell_blast/ │ │ │ └── Dockerfile │ │ ├── chetah/ │ │ │ ├── Dockerfile │ │ │ └── install_packages.R │ │ ├── cross_validation/ │ │ │ ├── Dockerfile │ │ │ └── install_packages.R │ │ ├── garnett/ │ │ │ ├── Dockerfile │ │ │ └── install_packages.R │ │ ├── scid/ │ │ │ ├── Dockerfile │ │ │ └── install_packages.R │ │ ├── scmap/ │ │ │ ├── Dockerfile │ │ │ └── install_packages.R │ │ ├── scvi/ │ │ │ └── Dockerfile │ │ ├── singlecellnet/ │ │ │ ├── Dockerfile │ │ │ └── install_packages.R │ │ └── singler/ │ │ ├── Dockerfile │ │ └── install_packages.R │ ├── LICENSE │ ├── README.md │ ├── Scripts/ │ │ ├── run_ACTINN.py │ │ ├── run_CHETAH.R │ │ ├── run_CaSTLe.R │ │ ├── run_Cell_BLAST.py │ │ ├── run_DigitalCellSorter.py │ │ ├── run_Garnett_CV.R │ │ ├── run_Garnett_Pretrained.R │ │ ├── run_LAmbDA.py │ │ ├── run_LDA.py │ │ ├── run_LDA_rejection.py │ │ ├── run_NMC.py │ │ ├── run_RF.py │ │ ├── run_SCINA.R │ │ ├── run_SVM.py │ │ ├── run_SVM_rejection.py │ │ ├── run_SingleR.R │ │ ├── run_kNN50.py │ │ ├── run_kNN9.py │ │ ├── run_moana.py │ │ ├── run_scID.R │ │ ├── run_scPred.R │ │ ├── run_scVI.py │ │ ├── run_scmap.R │ │ ├── run_scmapcell.R │ │ ├── run_scmapcluster.R │ │ ├── run_scmaptotal.R │ │ └── run_singleCellNet.R │ ├── Snakefile │ ├── evaluate.R │ ├── example.config.yml │ 
└── rank_gene_dropouts.py ├── evaluate.R └── rank_gene_dropouts.py
SYMBOL INDEX (42 symbols across 30 files) FILE: Scripts/run_ACTINN.py function run_ACTINN (line 7) | def run_ACTINN(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderP... FILE: Scripts/run_Cell_BLAST.py function run_Cell_BLAST (line 16) | def run_Cell_BLAST(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOr... FILE: Scripts/run_DigitalCellSorter.py function run_DigitalCellSorter (line 8) | def run_DigitalCellSorter(DataPath, LabelsPath, CV_RDataPath, GeneListPa... FILE: Scripts/run_LAmbDA.py function run_LAmbDA (line 21) | def run_LAmbDA(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderP... function wt_cutoff (line 162) | def wt_cutoff(colnum,cutoff,Gtmp,gamma): function resample (line 166) | def resample(prc_cut,Y,Gtmp,train,gamma): function select_feats (line 186) | def select_feats(Xtmp,num_zero_prc_cut,var_prc_cut): function get_yn (line 197) | def get_yn(predict,ys,delta,tau,output_feats): function get_yi (line 218) | def get_yi(rowsums,G2,ys): function run_LAmbDA2 (line 224) | def run_LAmbDA2(gamma, delta, tau, prc_cut, bs_prc, num_trees, max_nodes): FILE: Scripts/run_LDA.py function run_LDA (line 9) | def run_LDA(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath... FILE: Scripts/run_LDA_rejection.py function run_LDA_rejection (line 9) | def run_LDA_rejection(DataPath, LabelsPath, CV_RDataPath, OutputDir, Gen... FILE: Scripts/run_NMC.py function run_NMC (line 9) | def run_NMC(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath... FILE: Scripts/run_RF.py function run_RF (line 9) | def run_RF(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath ... FILE: Scripts/run_SVM.py function run_SVM (line 9) | def run_SVM(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath... FILE: Scripts/run_SVM_rejection.py function run_SVM (line 10) | def run_SVM(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath... 
FILE: Scripts/run_kNN50.py function run_kNN50 (line 9) | def run_kNN50(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPa... FILE: Scripts/run_kNN9.py function run_kNN9 (line 9) | def run_kNN9(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPat... FILE: Scripts/run_moana.py function run_moana (line 9) | def run_moana(DataPath, LabelsPath, ClassifierPath, OutputDir, GeneOrder... FILE: Scripts/run_scVI.py function run_scVI (line 10) | def run_scVI(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPat... FILE: Snakemake/Scripts/run_ACTINN.py function run_ACTINN (line 7) | def run_ACTINN(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderP... FILE: Snakemake/Scripts/run_Cell_BLAST.py function run_Cell_BLAST (line 18) | def run_Cell_BLAST(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOr... FILE: Snakemake/Scripts/run_DigitalCellSorter.py function run_DigitalCellSorter (line 8) | def run_DigitalCellSorter(DataPath, LabelsPath, CV_RDataPath, GeneListPa... FILE: Snakemake/Scripts/run_LAmbDA.py function run_LAmbDA (line 21) | def run_LAmbDA(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderP... function wt_cutoff (line 162) | def wt_cutoff(colnum,cutoff,Gtmp,gamma): function resample (line 166) | def resample(prc_cut,Y,Gtmp,train,gamma): function select_feats (line 186) | def select_feats(Xtmp,num_zero_prc_cut,var_prc_cut): function get_yn (line 197) | def get_yn(predict,ys,delta,tau,output_feats): function get_yi (line 218) | def get_yi(rowsums,G2,ys): function run_LAmbDA2 (line 224) | def run_LAmbDA2(gamma, delta, tau, prc_cut, bs_prc, num_trees, max_nodes): FILE: Snakemake/Scripts/run_LDA.py function run_LDA (line 12) | def run_LDA(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath... FILE: Snakemake/Scripts/run_LDA_rejection.py function run_LDA_rejection (line 12) | def run_LDA_rejection(DataPath, LabelsPath, CV_RDataPath, OutputDir, Gen... 
FILE: Snakemake/Scripts/run_NMC.py function run_NMC (line 12) | def run_NMC(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath... FILE: Snakemake/Scripts/run_RF.py function run_RF (line 12) | def run_RF(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath ... FILE: Snakemake/Scripts/run_SVM.py function run_SVM (line 12) | def run_SVM(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath... FILE: Snakemake/Scripts/run_SVM_rejection.py function run_SVM (line 13) | def run_SVM(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath... FILE: Snakemake/Scripts/run_kNN50.py function run_kNN50 (line 12) | def run_kNN50(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPa... FILE: Snakemake/Scripts/run_kNN9.py function run_kNN9 (line 12) | def run_kNN9(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPat... FILE: Snakemake/Scripts/run_moana.py function run_moana (line 9) | def run_moana(DataPath, LabelsPath, ClassifierPath, OutputDir, GeneOrder... FILE: Snakemake/Scripts/run_scVI.py function run_scVI (line 13) | def run_scVI(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPat... FILE: Snakemake/rank_gene_dropouts.py function rank_gene_dropouts (line 11) | def rank_gene_dropouts(DataPath, CV_RDataPath, OutputDir): FILE: rank_gene_dropouts.py function rank_gene_dropouts (line 8) | def rank_gene_dropouts(DataPath, CV_RDataPath, OutputDir):
Condensed preview — 82 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (316K chars).
[
{
"path": "Cross_Validation.R",
"chars": 1463,
"preview": "Cross_Validation <- function(LabelsPath, col_Index = 1,OutputDir){\r\n \"\r\n Cross_Validation\r\n Function returns train an"
},
{
"path": "DEgenesMAST.R",
"chars": 1766,
"preview": "DEgenesMAST <- function(Data, Labels, Normalize = FALSE, LogTransform = FALSE){\r\n # This functions applies a differenti"
},
{
"path": "LICENSE",
"chars": 1066,
"preview": "MIT License\n\nCopyright (c) 2019 tabdelaal\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\n"
},
{
"path": "README.md",
"chars": 5807,
"preview": "# A comparison of automatic cell identification methods for single-cell RNA-sequencing data\nWe present a comprehensive e"
},
{
"path": "Scripts/run_ACTINN.py",
"chars": 4203,
"preview": "import os \r\nimport numpy as np\r\nimport pandas as pd\r\nimport time as tm\r\nimport rpy2.robjects as robjects\r\n\r\ndef run_ACTI"
},
{
"path": "Scripts/run_CHETAH.R",
"chars": 3990,
"preview": "run_CHETAH<-function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){\r\n \"\r\n run CHETA"
},
{
"path": "Scripts/run_CaSTLe.R",
"chars": 6545,
"preview": "run_CaSTLe<-function(DataPath,LabelsPath,CV_RDataPath, OutputDir, GeneOrderPath = NULL, NumGenes = NULL){\r\n \"\r\n run Ca"
},
{
"path": "Scripts/run_Cell_BLAST.py",
"chars": 4502,
"preview": "import os\r\nimport time as tm\r\nimport pandas as pd\r\nimport warnings\r\nwarnings.filterwarnings(\"ignore\")\r\n\r\nimport tensorfl"
},
{
"path": "Scripts/run_DigitalCellSorter.py",
"chars": 3847,
"preview": "import numpy as np\r\nimport pandas as pd\r\nimport scripts.DigitalCellSorter as DigitalCellSorter\r\nimport os\r\nimport time a"
},
{
"path": "Scripts/run_Garnett_CV.R",
"chars": 5282,
"preview": "run_Garnett_CV <- function(DataPath, LabelsPath, CV_RDataPath, GenesPath, MarkerPath, OutputDir, Human){\r\n \"\r\n run Gar"
},
{
"path": "Scripts/run_Garnett_Pretrained.R",
"chars": 2840,
"preview": "run_Garnett_Pretrained <- function(DataPath, LabelsPath, GenesPath, CV_RDataPath, ClassifierPath, OutputDir, Human){\r\n "
},
{
"path": "Scripts/run_LAmbDA.py",
"chars": 12861,
"preview": "# -*- coding: utf-8 -*-\r\n\"\"\"\r\nCreated on Thu May 23 13:51:15 2019\r\n\r\n@author: Lieke\r\n\"\"\"\r\n\r\nimport os \r\nimport numpy as "
},
{
"path": "Scripts/run_LDA.py",
"chars": 3879,
"preview": "import os \r\nimport numpy as np\r\nimport pandas as pd\r\nimport time as tm\r\nfrom sklearn.discriminant_analysis import Linear"
},
{
"path": "Scripts/run_LDA_rejection.py",
"chars": 4135,
"preview": "import os \r\nimport numpy as np\r\nimport pandas as pd\r\nimport time as tm\r\nfrom sklearn.discriminant_analysis import Linear"
},
{
"path": "Scripts/run_NMC.py",
"chars": 3849,
"preview": "import os \r\nimport numpy as np\r\nimport pandas as pd\r\nimport time as tm\r\nfrom sklearn.neighbors import NearestCentroid\r\ni"
},
{
"path": "Scripts/run_RF.py",
"chars": 3882,
"preview": "import os \r\nimport numpy as np\r\nimport pandas as pd\r\nimport time as tm\r\nfrom sklearn.ensemble import RandomForestClassif"
},
{
"path": "Scripts/run_SCINA.R",
"chars": 2043,
"preview": "run_SCINA<-function(DataPath,LabelsPath,GeneSigPath,OutputDir){\r\n \"\r\n run SCINA\r\n Wrapper script to run SCINA on a be"
},
{
"path": "Scripts/run_SVM.py",
"chars": 3851,
"preview": "import os \r\nimport numpy as np\r\nimport pandas as pd\r\nimport time as tm\r\nfrom sklearn.svm import LinearSVC\r\nimport rpy2.r"
},
{
"path": "Scripts/run_SVM_rejection.py",
"chars": 4180,
"preview": "import os \r\nimport numpy as np\r\nimport pandas as pd\r\nimport time as tm\r\nfrom sklearn.svm import LinearSVC\r\nimport rpy2.r"
},
{
"path": "Scripts/run_SingleR.R",
"chars": 3406,
"preview": "run_SingleR<-function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){\r\n \"\r\n run Sing"
},
{
"path": "Scripts/run_kNN50.py",
"chars": 3902,
"preview": "import os \r\nimport numpy as np\r\nimport pandas as pd\r\nimport time as tm\r\nfrom sklearn.neighbors import KNeighborsClassifi"
},
{
"path": "Scripts/run_kNN9.py",
"chars": 3891,
"preview": "import os \r\nimport numpy as np\r\nimport pandas as pd\r\nimport time as tm\r\nfrom sklearn.neighbors import KNeighborsClassifi"
},
{
"path": "Scripts/run_moana.py",
"chars": 3560,
"preview": "import os\r\nimport pandas as pd\r\nimport numpy as np\r\nfrom moana.core import ExpMatrix\r\nfrom moana.classify import CellTyp"
},
{
"path": "Scripts/run_scID.R",
"chars": 3580,
"preview": "run_scID<-function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){\r\n \"\r\n run scID\r\n "
},
{
"path": "Scripts/run_scPred.R",
"chars": 5292,
"preview": "run_scPred<-function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){\r\n \"\r\n run scPre"
},
{
"path": "Scripts/run_scVI.py",
"chars": 5282,
"preview": "from scvi.dataset import CsvDataset\r\nimport os\r\nimport numpy as np\r\nimport pandas as pd\r\nfrom scvi.models import SCANVI\r"
},
{
"path": "Scripts/run_scmap.R",
"chars": 7372,
"preview": "run_scmap <- function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){\r\n \"\r\n run scma"
},
{
"path": "Scripts/run_singleCellNet.R",
"chars": 4550,
"preview": "run_singleCellNet<-function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){\r\n \"\r\n ru"
},
{
"path": "Snakemake/Cross_Validation.R",
"chars": 1546,
"preview": "args <- commandArgs(TRUE)\r\n\r\nCross_Validation <- function(LabelsPath, col_Index = 1, OutputDir){\r\n \"\r\n Cross_Validatio"
},
{
"path": "Snakemake/DEgenesMAST.R",
"chars": 1766,
"preview": "DEgenesMAST <- function(Data, Labels, Normalize = FALSE, LogTransform = FALSE){\r\n # This functions applies a differenti"
},
{
"path": "Snakemake/Dockerfiles/baseline/Dockerfile",
"chars": 1148,
"preview": "FROM debian:9.9-slim\n\n# Install newest R version\nRUN apt-get update && \\\n apt-get install --no-install-recommends --y"
},
{
"path": "Snakemake/Dockerfiles/cell_blast/Dockerfile",
"chars": 1109,
"preview": "FROM python:3.7-slim-stretch\n\n# Install newest R version\nRUN apt-get update && \\\n apt-get install --no-install-recomm"
},
{
"path": "Snakemake/Dockerfiles/chetah/Dockerfile",
"chars": 1101,
"preview": "FROM debian:9.9-slim\n\n# Install newest R version\nRUN apt-get update && \\\n apt-get install --no-install-recommends --y"
},
{
"path": "Snakemake/Dockerfiles/chetah/install_packages.R",
"chars": 434,
"preview": "withCallingHandlers({\n install.packages(\"devtools\", repos=\"https://cloud.r-project.org/\")\n install.packages(\"BiocManag"
},
{
"path": "Snakemake/Dockerfiles/cross_validation/Dockerfile",
"chars": 1033,
"preview": "FROM debian:9.9-slim\n\n# Install newest R version\nRUN apt-get update && \\\n apt-get install --no-install-recommends --y"
},
{
"path": "Snakemake/Dockerfiles/cross_validation/install_packages.R",
"chars": 202,
"preview": "withCallingHandlers({\n install.packages(\"lhs\", repos=\"https://cloud.r-project.org/\")\n install.packages(\"rBayesianOptim"
},
{
"path": "Snakemake/Dockerfiles/garnett/Dockerfile",
"chars": 1178,
"preview": "FROM debian:9.9-slim\n\n# Install newest R version\nRUN apt-get update && \\\n apt-get install --no-install-recommends --y"
},
{
"path": "Snakemake/Dockerfiles/garnett/install_packages.R",
"chars": 431,
"preview": "withCallingHandlers({\n install.packages(\"BiocManager\", repos=\"https://cloud.r-project.org/\")\n BiocManager::install(c(\""
},
{
"path": "Snakemake/Dockerfiles/scid/Dockerfile",
"chars": 513,
"preview": "FROM r-base:3.6.0\n\nCOPY Scripts/run_scID.R \\\n Dockerfiles/scid/install_packages.R \\\n /Scripts/\n\n# Install R pack"
},
{
"path": "Snakemake/Dockerfiles/scid/install_packages.R",
"chars": 370,
"preview": "withCallingHandlers({\n install.packages(\"BiocManager\", repos=\"https://cloud.r-project.org/\")\n BiocManager::install(ask"
},
{
"path": "Snakemake/Dockerfiles/scmap/Dockerfile",
"chars": 553,
"preview": "FROM r-base:3.6.0\n\nCOPY Scripts/run_scmapcell.R \\\n Scripts/run_scmapcluster.R \\\n Dockerfiles/scmap/install_packa"
},
{
"path": "Snakemake/Dockerfiles/scmap/install_packages.R",
"chars": 328,
"preview": "withCallingHandlers({\n install.packages(\"BiocManager\", repos=\"https://cloud.r-project.org/\")\n BiocManager::install(ask"
},
{
"path": "Snakemake/Dockerfiles/scvi/Dockerfile",
"chars": 1098,
"preview": "FROM python:3.7-slim-stretch\n\n# Install newest R version\nRUN apt-get update && \\\n apt-get install --no-install-recomm"
},
{
"path": "Snakemake/Dockerfiles/singlecellnet/Dockerfile",
"chars": 1157,
"preview": "FROM debian:9.9-slim\n\n# Install newest R version\nRUN apt-get update && \\\n apt-get install --no-install-recommends --y"
},
{
"path": "Snakemake/Dockerfiles/singlecellnet/install_packages.R",
"chars": 427,
"preview": "withCallingHandlers({\n install.packages(\"devtools\", repos=\"https://cloud.r-project.org/\")\n install.packages(\"BiocManag"
},
{
"path": "Snakemake/Dockerfiles/singler/Dockerfile",
"chars": 1090,
"preview": "FROM debian:9.9-slim\n\n# Install newest R version\nRUN apt-get update && \\\n apt-get install --no-install-recommends --y"
},
{
"path": "Snakemake/Dockerfiles/singler/install_packages.R",
"chars": 287,
"preview": "withCallingHandlers({\n install.packages(\"devtools\", repos=\"https://cloud.r-project.org/\")\n install.packages(\"Seurat\", "
},
{
"path": "Snakemake/LICENSE",
"chars": 1066,
"preview": "MIT License\n\nCopyright (c) 2019 tabdelaal\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\n"
},
{
"path": "Snakemake/README.md",
"chars": 4761,
"preview": "# scRNAseq_Benchmark\nBenchmarking classification tools for scRNA-seq data\n\n## How to use\n[snakemake](https://snakemake.r"
},
{
"path": "Snakemake/Scripts/run_ACTINN.py",
"chars": 4203,
"preview": "import os \r\nimport numpy as np\r\nimport pandas as pd\r\nimport time as tm\r\nimport rpy2.robjects as robjects\r\n\r\ndef run_ACTI"
},
{
"path": "Snakemake/Scripts/run_CHETAH.R",
"chars": 3800,
"preview": "args <- commandArgs(TRUE)\r\n\r\nrun_CHETAH<-function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGen"
},
{
"path": "Snakemake/Scripts/run_CaSTLe.R",
"chars": 6545,
"preview": "run_CaSTLe<-function(DataPath,LabelsPath,CV_RDataPath, OutputDir, GeneOrderPath = NULL, NumGenes = NULL){\r\n \"\r\n run Ca"
},
{
"path": "Snakemake/Scripts/run_Cell_BLAST.py",
"chars": 4237,
"preview": "import os\r\nfrom sys import argv\r\nfrom pathlib import Path\r\nimport time as tm\r\nimport pandas as pd\r\nimport warnings\r\nwarn"
},
{
"path": "Snakemake/Scripts/run_DigitalCellSorter.py",
"chars": 3847,
"preview": "import numpy as np\r\nimport pandas as pd\r\nimport scripts.DigitalCellSorter as DigitalCellSorter\r\nimport os\r\nimport time a"
},
{
"path": "Snakemake/Scripts/run_Garnett_CV.R",
"chars": 5417,
"preview": "args <- commandArgs(TRUE)\r\n\r\nrun_Garnett_CV <- function(DataPath, LabelsPath, CV_RDataPath, GenesPath, MarkerPath, Outpu"
},
{
"path": "Snakemake/Scripts/run_Garnett_Pretrained.R",
"chars": 2983,
"preview": "args <- commandArgs(TRUE)\r\n\r\nrun_Garnett_Pretrained <- function(DataPath, LabelsPath, GenesPath, CV_RDataPath, Classifie"
},
{
"path": "Snakemake/Scripts/run_LAmbDA.py",
"chars": 12861,
"preview": "# -*- coding: utf-8 -*-\r\n\"\"\"\r\nCreated on Thu May 23 13:51:15 2019\r\n\r\n@author: Lieke\r\n\"\"\"\r\n\r\nimport os \r\nimport numpy as "
},
{
"path": "Snakemake/Scripts/run_LDA.py",
"chars": 3578,
"preview": "import os\r\nfrom sys import argv\r\nfrom pathlib import Path\r\n\r\nimport numpy as np\r\nimport pandas as pd\r\nimport time as tm\r"
},
{
"path": "Snakemake/Scripts/run_LDA_rejection.py",
"chars": 3876,
"preview": "import os\r\nfrom sys import argv\r\nfrom pathlib import Path\r\n\r\nimport numpy as np\r\nimport pandas as pd\r\nimport time as tm\r"
},
{
"path": "Snakemake/Scripts/run_NMC.py",
"chars": 3549,
"preview": "import os\r\nfrom sys import argv\r\nfrom pathlib import Path\r\n\r\nimport numpy as np\r\nimport pandas as pd\r\nimport time as tm\r"
},
{
"path": "Snakemake/Scripts/run_RF.py",
"chars": 3583,
"preview": "import os\r\nfrom sys import argv\r\nfrom pathlib import Path\r\n\r\nimport numpy as np\r\nimport pandas as pd\r\nimport time as tm\r"
},
{
"path": "Snakemake/Scripts/run_SCINA.R",
"chars": 2043,
"preview": "run_SCINA<-function(DataPath,LabelsPath,GeneSigPath,OutputDir){\r\n \"\r\n run SCINA\r\n Wrapper script to run SCINA on a be"
},
{
"path": "Snakemake/Scripts/run_SVM.py",
"chars": 3549,
"preview": "import os\r\nfrom sys import argv\r\nfrom pathlib import Path\r\n\r\nimport numpy as np\r\nimport pandas as pd\r\nimport time as tm\r"
},
{
"path": "Snakemake/Scripts/run_SVM_rejection.py",
"chars": 3919,
"preview": "import os\r\nfrom sys import argv\r\nfrom pathlib import Path\r\n\r\nimport numpy as np\r\nimport pandas as pd\r\nimport time as tm\r"
},
{
"path": "Snakemake/Scripts/run_SingleR.R",
"chars": 3199,
"preview": "args <- commandArgs(TRUE)\r\n\r\nrun_SingleR<-function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGe"
},
{
"path": "Snakemake/Scripts/run_kNN50.py",
"chars": 3593,
"preview": "import os\r\nfrom sys import argv\r\nfrom pathlib import Path\r\n\r\nimport numpy as np\r\nimport pandas as pd\r\nimport time as tm\r"
},
{
"path": "Snakemake/Scripts/run_kNN9.py",
"chars": 3586,
"preview": "import os\r\nfrom sys import argv\r\nfrom pathlib import Path\r\n\r\nimport numpy as np\r\nimport pandas as pd\r\nimport time as tm\r"
},
{
"path": "Snakemake/Scripts/run_moana.py",
"chars": 3560,
"preview": "import os\r\nimport pandas as pd\r\nimport numpy as np\r\nfrom moana.core import ExpMatrix\r\nfrom moana.classify import CellTyp"
},
{
"path": "Snakemake/Scripts/run_scID.R",
"chars": 3312,
"preview": "args <- commandArgs(TRUE)\n\nrun_scID<-function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes ="
},
{
"path": "Snakemake/Scripts/run_scPred.R",
"chars": 5292,
"preview": "run_scPred<-function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){\r\n \"\r\n run scPre"
},
{
"path": "Snakemake/Scripts/run_scVI.py",
"chars": 5064,
"preview": "from scvi.dataset import CsvDataset\r\nimport os\r\nfrom sys import argv\r\nfrom pathlib import Path\r\nfrom scvi.dataset import"
},
{
"path": "Snakemake/Scripts/run_scmap.R",
"chars": 7372,
"preview": "run_scmap <- function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){\r\n \"\r\n run scma"
},
{
"path": "Snakemake/Scripts/run_scmapcell.R",
"chars": 5025,
"preview": "args <- commandArgs(TRUE)\n\nrun_scmapcell <- function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,Num"
},
{
"path": "Snakemake/Scripts/run_scmapcluster.R",
"chars": 5032,
"preview": "args <- commandArgs(TRUE)\n\nrun_scmapcluster <- function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,"
},
{
"path": "Snakemake/Scripts/run_scmaptotal.R",
"chars": 7372,
"preview": "run_scmap <- function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL,NumGenes = NULL){\r\n \"\r\n run scma"
},
{
"path": "Snakemake/Scripts/run_singleCellNet.R",
"chars": 4195,
"preview": "args <- commandArgs(TRUE)\r\n\r\nrun_singleCellNet<-function(DataPath,LabelsPath,CV_RDataPath,OutputDir,GeneOrderPath = NULL"
},
{
"path": "Snakemake/Snakefile",
"chars": 17062,
"preview": "dockerTag = \"latest\" #FIXME tagged versions\n\ndef feature_ranking(w):\n if \"feature_ranking\" in config.keys():\n "
},
{
"path": "Snakemake/evaluate.R",
"chars": 3235,
"preview": "args <- commandArgs(TRUE)\r\n\r\nTrueLabelsPath <- args[1]\r\nPredLabelsPath <- args[2]\r\nOutputDir <- args[3]\r\nToolName <- arg"
},
{
"path": "Snakemake/example.config.yml",
"chars": 155,
"preview": "output_dir: output\ndatafile: input/data.csv\nlabfile: input/Labels.csv\ncolumn: 1\nnumber_of_features: 0\ntools_to_run:\n - "
},
{
"path": "Snakemake/rank_gene_dropouts.py",
"chars": 2589,
"preview": "import os\r\nfrom sys import argv\r\nfrom pathlib import Path\r\n\r\nimport rpy2.robjects as robjects\r\nimport numpy as np\r\nimpor"
},
{
"path": "evaluate.R",
"chars": 2671,
"preview": "evaluate <- function(TrueLabelsPath, PredLabelsPath, Indices = NULL){\r\n \"\r\n Script to evaluate the performance of the "
},
{
"path": "rank_gene_dropouts.py",
"chars": 2613,
"preview": "import os\r\nimport rpy2.robjects as robjects\r\nimport numpy as np\r\nimport pandas as pd\r\nfrom sklearn import linear_model\r\n"
}
]
About this extraction
This page contains the full source code of the tabdelaal/scRNAseq_Benchmark GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 82 files (288.4 KB), approximately 79.9k tokens, and a symbol index with 42 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.