Repository: SSSYDYSSS/TransProPy
Branch: main
Commit: 2b61483a9cc6
Files: 43
Total size: 74.8 KB
Directory structure:
gitextract_wxsffzrf/
├── .github/
│ └── workflows/
│ └── python-package.yml
├── .gitignore
├── .idea/
│ ├── .gitignore
│ ├── TransProPy.iml
│ ├── inspectionProfiles/
│ │ └── profiles_settings.xml
│ ├── modules.xml
│ └── vcs.xml
├── LICENSE
├── README.md
├── TransProPy/
│ ├── AutoFeatureSelection.py
│ ├── AutogluonSelectML.py
│ ├── AutogluonTimeLimit.py
│ ├── MACFCmain.py
│ ├── NewMACFCmain.py
│ ├── UtilsFunction1/
│ │ ├── Auc.py
│ │ ├── AutoNorm.py
│ │ ├── FeatureRanking.py
│ │ ├── FilterSamples.py
│ │ ├── GeneNames.py
│ │ ├── GeneToFeatureMapping.py
│ │ ├── LoadData.py
│ │ ├── NewFeatureRanking.py
│ │ ├── PrintResults.py
│ │ └── __init__.py
│ ├── UtilsFunction2/
│ │ ├── LogTransform.py
│ │ ├── __init__.py
│ │ └── splitdata.py
│ ├── UtilsFunction3/
│ │ ├── EnsembleForRFE.py
│ │ ├── ExtractAndSaveResults.py
│ │ ├── ExtractCommonSamples.py
│ │ ├── LoadAndPreprocessData.py
│ │ ├── LoadEncodeLabels.py
│ │ ├── LoadFilterTranspose.py
│ │ ├── LoggingCustomScorer.py
│ │ ├── PrintBoxedText.py
│ │ ├── SetupFeatureSelection.py
│ │ ├── SetupLoggingAndProgressBar.py
│ │ ├── TqdmCustomScorer.py
│ │ ├── TrainModel.py
│ │ ├── UpdateProgressBar.py
│ │ └── __init__.py
│ └── __init__.py
└── setup.py
================================================
FILE CONTENTS
================================================
================================================
FILE: .github/workflows/python-package.yml
================================================
name: Python package
on:
push:
branches: [ "main" ]
pull_request:
branches: [ "main" ]
jobs:
build:
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
python-version: ["3.9", "3.10", "3.11"]
steps:
- uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v3
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
python -m pip install flake8 pytest
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
- name: Lint with flake8
run: |
# Lint the code but don't fail the build on errors
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics || true
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
- name: Test with pytest
run: |
# Run pytest but don't fail the build if no tests are defined or if tests fail
pytest || true
================================================
FILE: .gitignore
================================================
# Created by https://www.toptal.com/developers/gitignore/api/python
# Edit at https://www.toptal.com/developers/gitignore?templates=python
### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
### Python Patch ###
# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
poetry.toml
# ruff
.ruff_cache/
# LSP config files
pyrightconfig.json
# End of https://www.toptal.com/developers/gitignore/api/python
================================================
FILE: .idea/.gitignore
================================================
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml
================================================
FILE: .idea/TransProPy.iml
================================================
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="PyDocumentationSettings">
<option name="format" value="GOOGLE" />
<option name="myDocStringFormat" value="Google" />
</component>
</module>
================================================
FILE: .idea/inspectionProfiles/profiles_settings.xml
================================================
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>
================================================
FILE: .idea/modules.xml
================================================
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/TransProPy.iml" filepath="$PROJECT_DIR$/.idea/TransProPy.iml" />
</modules>
</component>
</project>
================================================
FILE: .idea/vcs.xml
================================================
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>
================================================
FILE: LICENSE
================================================
BSD 3-Clause License
Copyright (c) 2023- Yu Dongyue, SuperOmics All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
================================================
FILE: README.md
================================================
[](https://pypi.org/project/transpropy/) [](https://github.com/SSSYDYSSS/TransProPy/blob/main/LICENSE) [](https://github.com/SSSYDYSSS/TransProPy/actions/workflows/python-package.yml)
# TransProPy
<img src="image/TransProPy_Pylogo.png" alt="TransProPy Logo" width="250" height="250" align="right">
A Python package that integrates algorithms and various machine learning approaches to extract features (genes) effective for classification and attribute them accordingly.
## Installation
```bash
pip3 install TransProPy
pip3 install git+https://github.com/SSSYDYSSS/TransProPy.git
```
## Usage
```python
# e.g.:
from TransProPy import my_function
my_function()
```
## Citation
If you use TransPro in your research, please cite:
Dongyue Yu; Chen Li; Shuo Yan; Lujiale Guo; Jingyu Liang; Shengquan Chen*; Wenjun Bu* (2026). Comparative Evaluation of Differential Gene Selection Methods in Transcriptomics: Bias Correction and Visualization with TransPro. Manuscript in preparation.
**Correspondence:**
Shengquan Chen — School of Mathematical Sciences and LPMC, Nankai University, Tianjin 300071, China.
Wenjun Bu — Institute of Entomology, College of Life Sciences, Nankai University, Tianjin 300071, China.
## More examples see
TransProPy Manual: https://sssydysss.github.io/TransProPyBook/
## Contributing
Pull requests are welcome. For major changes, please open an issue first to discuss what you would like to change.
## License
This project is licensed under the BSD 3-Clause License - see the [LICENSE](./LICENSE) file for details.
================================================
FILE: TransProPy/AutoFeatureSelection.py
================================================
import threading
import time
from scipy.stats import reciprocal, randint
from TransProPy.UtilsFunction3.LoadAndPreprocessData import load_and_preprocess_data
from TransProPy.UtilsFunction3.SetupFeatureSelection import setup_feature_selection
from TransProPy.UtilsFunction3.TrainModel import train_model
from TransProPy.UtilsFunction3.ExtractAndSaveResults import extract_and_save_results
from TransProPy.UtilsFunction3.SetupLoggingAndProgressBar import setup_logging_and_progress_bar
from TransProPy.UtilsFunction3.UpdateProgressBar import update_progress_bar
def auto_feature_selection(data_file, label_file, label_col, threshold, show_plot, show_progress, n_iter=5, n_cv=5, n_jobs=9, save_path='../data/', sleep_interval=1, use_tkagg=False):
    """
    Run the complete analysis pipeline from data loading to training and result extraction.
    Parameters:
    - data_file: str, path to the feature data file.
    - label_file: str, path to the label data file.
    - label_col: str, name of the label column.
    - threshold: float, threshold for data preprocessing.
    - show_plot: bool, whether to display plot.
    - show_progress: bool, whether to show progress bar.
    - n_iter: int, number of iterations for RandomizedSearchCV.
    - n_cv: int, number of folds for cross-validation.
    - n_jobs: int, number of parallel jobs for RandomizedSearchCV.
    - save_path: str, path to save results.
    - sleep_interval: int, interval time in seconds for progress bar update.
    - use_tkagg: bool, whether to use 'TkAgg' backend for matplotlib. Generally, choose False when using in PyCharm IDE, and choose True when rendering file.qmd to an HTML file.
    Returns:
    - None. Results are handed to extract_and_save_results (presumably persisted under save_path — confirm against that helper).
    """
    # Load and preprocess data
    X, Y = load_and_preprocess_data(data_file, label_file, label_col, threshold)
    # Set up feature selection
    feature_selection = setup_feature_selection()
    # Define parameters for RandomizedSearchCV
    # Keys follow the sklearn Pipeline convention <step>__<substep>__<param>;
    # the distributions (reciprocal, randint) are sampled by the search.
    parameters = {
        'feature_selection__rfecv__estimator__svm__C': reciprocal(0.001, 1000),
        'feature_selection__rfecv__estimator__tree__max_depth': randint(2, 10),
        'feature_selection__rfecv__estimator__tree__min_samples_split': randint(2, 10),
        'feature_selection__rfecv__estimator__gbm__learning_rate': reciprocal(0.01, 0.2),
        'feature_selection__rfecv__estimator__gbm__n_estimators': randint(100, 500),
        'feature_selection__rfecv__step': randint(10, 150),
        'feature_selection__rfecv__min_features_to_select': randint(10, 1000),
        'feature_selection__selectkbest__k': randint(10, 200),
        'stacking__final_estimator__C': reciprocal(0.001, 1000) # Parameter for logistic regression in stacking classifier
    }
    # Train the model
    # NOTE(review): clf.fit is invoked below, so train_model appears to return
    # an unfitted search object rather than a trained model — confirm.
    clf = train_model(X, Y, feature_selection, parameters, n_iter, n_cv, n_jobs)
    # Define a function to run RandomizedSearchCV
    def run_randomized_search():
        clf.fit(X, Y)
    # Initialize tqdm progress bar and logging
    if show_progress:
        progress_bar = setup_logging_and_progress_bar(n_iter, n_cv)
        search_thread = threading.Thread(target=run_randomized_search) # Use threading to run RandomizedSearchCV
        search_thread.start()
        # Update the progress bar in the main thread
        while search_thread.is_alive():
            update_progress_bar(progress_bar)
            time.sleep(sleep_interval)
        # Ensure RandomizedSearchCV completes
        search_thread.join() # The main thread will wait for the search thread to complete all search and computation processes before continuing to execute the code after the main thread. This ensures that the main thread continues only after the search process is fully completed and the results are returned.
    else:
        run_randomized_search()
    # Extract and save results
    extract_and_save_results(clf, X, Y, save_path, n_cv, show_plot, use_tkagg)
================================================
FILE: TransProPy/AutogluonSelectML.py
================================================
from autogluon.tabular import TabularDataset, TabularPredictor
from TransProPy.UtilsFunction2.splitdata import split_data
def AutoGluon_SelectML(gene_data_path, class_data_path, label_column, test_size, threshold, hyperparameters=None, random_feature=None, num_bag_folds=None, num_stack_levels=None, time_limit=120, random_state=42):
    """
    Train AutoGluon tabular models on gene-expression data and report
    feature importance plus a model leaderboard.

    Parameters:
    - gene_data_path (str): Path to the gene expression CSV, e.g. '../data/gene_tpm.csv'.
    - class_data_path (str): Path to the class CSV, e.g. '../data/tumor_class.csv'.
    - label_column (str): Name of the target label column.
    - test_size (float): Fraction of the data held out as the test set.
    - threshold (float): Proportion of non-zero values required to keep a row.
    - hyperparameters (dict, optional): Model hyperparameters, e.g. {'GBM': {}, 'RF': {}}.
    - random_feature (int, optional): Number of random features to select;
      None disables random feature selection.
    - num_bag_folds (int, optional): Bagging folds forwarded to AutoGluon.
      0/None disables bagging; 5-10 is recommended for predictive performance;
      1 is invalid and raises ValueError; values > 10 may overfit.
    - num_stack_levels (int, optional): Stack-ensembling levels forwarded to
      AutoGluon; requires num_bag_folds >= 2 when enabled.
    - time_limit (int, optional): Training time budget in seconds. Default 120.
    - random_state (int, optional): Seed for the train/test split. Default 42.

    Returns:
    - importance (DataFrame): Per-feature importance computed on the test set.
    - leaderboard (DataFrame): Model performance on the test set.

    References:
    - AutoGluon-Tabular: Robust and Accurate AutoML for Structured Data (arXiv, 2020).
    - https://auto.gluon.ai/0.1.0/api/autogluon.predictor.html?highlight=num_bag_folds
    """
    raw_train, raw_test = split_data(
        gene_data_path,
        class_data_path,
        class_name=label_column,
        test_size=test_size,
        random_state=random_state,
        threshold=threshold,
        random_feature=random_feature,
    )
    train_set = TabularDataset(raw_train)
    test_set = TabularDataset(raw_test)
    # Fit within the time budget; bagging/stacking settings are forwarded as given.
    predictor = TabularPredictor(label=label_column).fit(
        train_set,
        hyperparameters=hyperparameters,
        time_limit=time_limit,
        num_bag_folds=num_bag_folds,
        num_stack_levels=num_stack_levels,
    )
    # Importance is computed on the full held-out set (no subsampling).
    importance = predictor.feature_importance(test_set, subsample_size=None)
    leaderboard = predictor.leaderboard(test_set)
    return importance, leaderboard
================================================
FILE: TransProPy/AutogluonTimeLimit.py
================================================
from autogluon.tabular import TabularDataset, TabularPredictor
from TransProPy.UtilsFunction2.splitdata import split_data
def Autogluon_TimeLimit(gene_data_path, class_data_path, label_column, test_size, threshold, random_feature=None, num_bag_folds=None, num_stack_levels=None, time_limit=120, random_state=42):
    """
    2.1_autogluon_time-limit.
    Train AutoGluon tabular models (default hyperparameters) under a time
    budget and report feature importance plus a model leaderboard.

    Parameters:
    - gene_data_path (str): Path to the gene expression CSV, e.g. '../data/gene_tpm.csv'.
    - class_data_path (str): Path to the class CSV, e.g. '../data/tumor_class.csv'.
    - label_column (str): Name of the target label column.
    - test_size (float): Fraction of the data held out as the test set.
    - threshold (float): Proportion of non-zero values required to keep a row.
    - random_feature (int, optional): Number of random features to select;
      None disables random feature selection.
    - num_bag_folds (int, optional): Bagging folds forwarded to AutoGluon.
      0/None disables bagging; 5-10 is recommended for predictive performance;
      1 is invalid and raises ValueError; values > 10 may overfit.
    - num_stack_levels (int, optional): Stack-ensembling levels forwarded to
      AutoGluon; requires num_bag_folds >= 2 when enabled.
    - time_limit (int, optional): Training time budget in seconds. Default 120.
    - random_state (int): Seed for the train/test split. Default 42.

    Returns:
    - importance (DataFrame): Per-feature importance computed on the test set.
    - leaderboard (DataFrame): Model performance on the test set.

    References:
    - AutoGluon-Tabular: Robust and Accurate AutoML for Structured Data (arXiv, 2020).
    - https://auto.gluon.ai/0.1.0/api/autogluon.predictor.html?highlight=num_bag_folds
    """
    raw_train, raw_test = split_data(
        gene_data_path,
        class_data_path,
        class_name=label_column,
        test_size=test_size,
        random_state=random_state,
        threshold=threshold,
        random_feature=random_feature,
    )
    train_set = TabularDataset(raw_train)
    test_set = TabularDataset(raw_test)
    # Fit within the time budget; bagging/stacking settings are forwarded as given.
    predictor = TabularPredictor(label=label_column).fit(
        train_set,
        time_limit=time_limit,
        num_bag_folds=num_bag_folds,
        num_stack_levels=num_stack_levels,
    )
    # Importance is computed on the full held-out set (no subsampling).
    importance = predictor.feature_importance(test_set, subsample_size=None)
    leaderboard = predictor.leaderboard(test_set)
    return importance, leaderboard
================================================
FILE: TransProPy/MACFCmain.py
================================================
from numpy import *
from TransProPy.UtilsFunction1.LoadData import load_data
from TransProPy.UtilsFunction1.FeatureRanking import feature_ranking
from TransProPy.UtilsFunction1.PrintResults import print_results
from collections import Counter
def MACFCmain(max_rank, lable_name, threshold, data_path='../data/gene_tpm.csv', label_path='../data/tumor_class.csv'):
    """
    1.1_feature_ranking_modle.
    Apply the MACFC selection to rank feature genes for binary classification.

    Parameters:
        max_rank: int
            Total number of gene combinations to obtain.
        lable_name: str
            Name of the categorical label column (e.g. gender, age, quality).
        threshold: float
            Proportion of non-zero-value samples required for a feature to be
            kept (e.g. 0.9).
        data_path: str
            Path to the expression matrix CSV (default '../data/gene_tpm.csv').
            Preprocess in advance to drop samples with too many missing/zero values.
        label_path: str
            Path to the class CSV (default '../data/tumor_class.csv'); labels
            must be numeric binary, e.g. 1,2,1,1,2,2,1.

    Returns:
        tuple: (fr, fre1, frequency, len(FName), FName, Fauc)
            fr        - ranked feature names (list of str).
            fre1      - dict mapping feature name -> occurrence frequency.
            frequency - (name, frequency) pairs with frequency > 1, sorted in
                        descending order of frequency.
            len(FName) - count of features with AUC > 0.5.
            FName     - feature names after ranking with AUC > 0.5.
            Fauc      - AUC values corresponding to FName.

    References:
        Su, Y., Du, K., Wang, J., Wei, J. and Liu, J. (2022) Multi-variable AUC
        for sifting complementary features and its biomedical application.
        Briefings in Bioinformatics, 23, bbac029.
    """
    # Load the filtered expression matrix and the class vector.
    f, c = load_data(lable_name, threshold, data_path, label_path)
    # NOTE(review): unpacking a two-element set assigns pos/neg in an arbitrary
    # order; n0/n1 simply follow whichever assignment occurs.
    pos, neg = set(c)
    class_list = list(c)
    n0, n1 = class_list.count(pos), class_list.count(neg)
    FName, Fauc, fr, fre = feature_ranking(f, c, max_rank, pos, neg, n0, n1)
    fre1 = dict(Counter(fre))
    repeated = {name: count for name, count in fre1.items() if count > 1}
    frequency = sorted(repeated.items(), key=lambda item: (item[1], item[0]), reverse=True)
    return fr, fre1, frequency, len(FName), FName, Fauc
================================================
FILE: TransProPy/NewMACFCmain.py
================================================
from numpy import *
from TransProPy.UtilsFunction1.LoadData import load_data
from TransProPy.UtilsFunction1.NewFeatureRanking import new_feature_ranking
from TransProPy.UtilsFunction1.PrintResults import print_results
from collections import Counter
def New_MACFCmain(AUC_threshold, max_rank, lable_name, threshold, data_path='../data/gene_tpm.csv', label_path='../data/tumor_class.csv'):
    """
    1.1_feature_ranking_modle.
    Apply the MACFC selection to rank feature genes for binary classification,
    additionally recording features whose individual AUC exceeds AUC_threshold.

    Parameters:
        AUC_threshold: float
            Features with AUC above this value are recorded separately and
            excluded from subsequent combination calculations.
        max_rank: int
            Total number of gene combinations to obtain.
        lable_name: str
            Name of the categorical label column (e.g. gender, age, quality).
        threshold: float
            Proportion of non-zero-value samples required for a feature to be
            kept (e.g. 0.9).
        data_path: str
            Path to the expression matrix CSV (default '../data/gene_tpm.csv').
            Preprocess in advance to drop samples with too many missing/zero values.
        label_path: str
            Path to the class CSV (default '../data/tumor_class.csv'); labels
            must be numeric binary, e.g. 1,2,1,1,2,2,1.

    Returns:
        tuple: (high_auc_features, fr, fre1, frequency, len(FName), FName, Fauc)
            high_auc_features - (feature index as str, AUC as float) tuples for
                                features whose AUC exceeds AUC_threshold.
            fr        - ranked feature names (list of str).
            fre1      - dict mapping feature name -> occurrence frequency.
            frequency - (name, frequency) pairs with frequency > 1, sorted in
                        descending order of frequency.
            len(FName) - count of features with AUC > 0.5.
            FName     - feature names after ranking with AUC > 0.5.
            Fauc      - AUC values corresponding to FName.

    References:
        Su, Y., Du, K., Wang, J., Wei, J. and Liu, J. (2022) Multi-variable AUC
        for sifting complementary features and its biomedical application.
        Briefings in Bioinformatics, 23, bbac029.
    """
    # Load the filtered expression matrix and the class vector.
    f, c = load_data(lable_name, threshold, data_path, label_path)
    # NOTE(review): unpacking a two-element set assigns pos/neg in an arbitrary
    # order; n0/n1 simply follow whichever assignment occurs.
    pos, neg = set(c)
    class_list = list(c)
    n0, n1 = class_list.count(pos), class_list.count(neg)
    high_auc_features, FName, Fauc, fr, fre = new_feature_ranking(f, c, AUC_threshold, max_rank, pos, neg, n0, n1)
    fre1 = dict(Counter(fre))
    repeated = {name: count for name, count in fre1.items() if count > 1}
    frequency = sorted(repeated.items(), key=lambda item: (item[1], item[0]), reverse=True)
    return high_auc_features, fr, fre1, frequency, len(FName), FName, Fauc
================================================
FILE: TransProPy/UtilsFunction1/Auc.py
================================================
from numpy import *
def auc(tlofe, ne, n0, n1):
    """
    Compute the AUC of a feature from class labels ordered by feature value.

    Parameters:
        tlofe: sequence
            Class labels sorted by ascending feature value.
        ne: label
            The label treated as the "negative" class.
        n0: int
            Count of one class in the full label vector.
        n1: int
            Count of the other class (n0 * n1 is the total pair count).

    Returns:
        float: 1 - discordant_pairs / (n0 * n1), i.e. the fraction of
        (positive, negative) pairs where the positive ranks above the negative.
    """
    neg_seen = 0        # negatives encountered so far, scanning from the top
    pos_run = 0         # positives in the current uninterrupted run
    in_pos_run = False  # whether the scan is currently inside a positive run
    discordant = 0      # accumulated count of mis-ordered pairs
    # Walk from the highest-ranked label down to the lowest.
    for label in tlofe[::-1]:
        if label == ne:
            if in_pos_run:
                # Close the positive run: each of its positives is out-ranked
                # by every negative already seen above it.
                discordant += neg_seen * pos_run
                in_pos_run = False
                pos_run = 0
            neg_seen += 1
        else:
            if not in_pos_run:
                in_pos_run = True
            pos_run += 1
    # Account for a positive run that reaches the bottom of the list.
    discordant += neg_seen * pos_run
    return (n0 * n1 - discordant) / (n0 * n1)
================================================
FILE: TransProPy/UtilsFunction1/AutoNorm.py
================================================
from numpy import *
def auto_norm(data):
    """
    Min-max normalize each column of a (sample, feature) matrix to [0, 1].

    Parameters:
    data: ndarray
        Rows are samples, columns are features; each column is normalized
        independently as (x - min) / (max - min).

    Returns:
    norm_data: ndarray
        The normalized data. Columns that are constant (max == min) are
        mapped to 0 instead of producing NaN/inf.
    """
    mins = data.min(0)
    maxs = data.max(0)
    ranges = maxs - mins
    # BUG FIX: a constant column has range 0 and previously produced
    # NaN/inf via division by zero; substitute 1 so those columns come
    # out as all zeros. NumPy broadcasting replaces the old tile() calls.
    safe_ranges = where(ranges == 0, 1, ranges)
    norm_data = (data - mins) / safe_ranges
    return norm_data
================================================
FILE: TransProPy/UtilsFunction1/FeatureRanking.py
================================================
from numpy import *
from TransProPy.UtilsFunction1.Auc import auc
def feature_ranking(f, c, max_rank, pos, neg, n0, n1):
    """
    Rank features by single-feature AUC and greedily assemble complementary
    feature sets (multi-variable AUC / MACFC-style ranking).

    NOTE(review): the indentation of this block was reconstructed from a
    whitespace-stripped extract; the nesting of the `a < 0.5` correction and
    of the coverage-removal loops should be confirmed against the reference
    implementation.

    Parameters
    ----------
    f : ndarray
        Feature-by-sample matrix (rows are features).
    c : ndarray
        Class label per sample, aligned with the columns of f.
    max_rank : int
        Number of unique features requested in the final ranking.
    pos, neg :
        The two class label values appearing in c.
    n0, n1 : int
        Sample counts of the two classes (forwarded to auc()).

    Returns
    -------
    FName : ndarray of str
        Feature indices (stringified) kept after coverage removal, AUC-ranked.
    Fauc : ndarray of float
        AUC values aligned with FName.
    rankset : list
        Unique selected features, in selection order.
    ranklist : list
        Selected features including repeats across complementary sets.
    """
    f_auc = []
    # Feature "names" are just stringified row indices into f.
    f_no = [str(i) for i in range(shape(f)[0])]
    # Per-feature boolean mask marking samples in the ambiguous (mixed-class)
    # middle of the feature's sorted order.
    f_mtf = full((shape(f)[0], shape(f)[1]), False)
    f_ne = []
    fl = shape(f_no)[0]
    # print('fl ', fl)
    for j in range(fl):
        # Sort samples by this feature's values; slofe is the label
        # sequence in that order.
        argfv = argsort(f[j])
        slofe = c[argfv]
        ne = slofe[0]  # class observed at the low end of the ordering
        a = auc(slofe, ne, n0, n1)
        if a < 0.5:
            # Feature discriminates in the opposite direction: flip the AUC
            # and the negative-class assignment together.
            # NOTE(review): the extra slofe[0]==slofe[-1] guard is kept as
            # reconstructed — confirm its scope against the original source.
            if slofe[0] == slofe[-1]:
                a = 1 - a
                if ne == pos:
                    ne = neg
                else:
                    ne = pos
        f_auc.append(a)
        f_ne.append(ne)
        # ml / mr delimit the leading and trailing runs of identical labels;
        # the slice between them is the ambiguous region for this feature.
        ml = 1
        mr = 1
        for i in range(1, size(slofe)):
            if slofe[i] == slofe[0]:
                ml += 1
            else:
                break
        for i in range(-2, -size(slofe), -1):
            if slofe[i] == slofe[-1]:
                mr += 1
            else:
                break
        mr = size(slofe) - mr
        if slofe[0] == slofe[-1]:
            # Both ends show the same class; widen the region accordingly.
            if not slofe[0] == ne:
                ml = 0
            else:
                mr = size(slofe)
        f_mtf[j][argfv[ml:mr]] = True
    # print(f_auc)
    # Order every per-feature array by descending AUC.
    arg_auc = argsort(-array(f_auc))
    FName = array(f_no)[arg_auc]
    Fvalue = array(f)[arg_auc]
    Fauc = array(f_auc)[arg_auc]
    Fne = array(f_ne)[arg_auc]
    FmTF = array(f_mtf)[arg_auc]
    # print('SORT VALUE', Fvalue)
    # print('SORT M', FmTF)
    # print('SORT NAME', FName)
    # print('SORT AUC', Fauc)
    kk = 0    # count of features with AUC < 0.5
    slen = 0  # how long the running mask intersection stays non-empty
    Fmcount = ones((len(FmTF[0])))
    Fmcount = Fmcount.astype(bool)
    for i in range(fl):
        if Fauc[i] < 0.5:
            kk += 1
        Fmcount &= FmTF[i]
        if True in Fmcount:
            slen += 1
    # print('Totally ', kk, ' features with auc under 0.5')
    # Coverage removal: if one feature's ambiguous region contains
    # another's, the one with the larger region is marked -2 (removed).
    for i in range(fl):
        if Fauc[i] < 0.5:
            continue
        for j in range(i + 1, fl):
            if Fauc[j] < 0.5:
                continue
            nflg = 0
            if not ((FmTF[i] & FmTF[j]) == FmTF[i]).all():
                if not ((FmTF[i] & FmTF[j]) == FmTF[j]).all():
                    nflg = 1  # neither region contains the other
            if nflg == 0:
                if FmTF[i].sum() <= FmTF[j].sum():
                    Fauc[j] = -2
                else:
                    Fauc[i] = -2
                    break  # i itself is removed; stop comparing against it
    ii = []  # surviving features (AUC > 0.5, not covered)
    gg = []  # removed/low-AUC features
    for i in range(fl):
        if Fauc[i] > 0.5:
            ii.append(i)
        else:
            gg.append(i)
    # NOTE(review): gg holds positions in the AUC-sorted arrays while f_auc
    # is in original feature order — verify this indexing against the
    # reference implementation.
    arg_auc_gg = argsort(-array(f_auc)[gg])
    gg = [str(FName[i]) for i in array(gg)[arg_auc_gg]]
    # print('Totally ' + str(fl - len(ii)) + ' features are covered and removed.')
    # print(gg)
    FName = FName[ii]
    Fvalue = Fvalue[ii]
    Fauc = Fauc[ii]
    Fne = Fne[ii]
    FmTF = FmTF[ii]
    # If fewer features survive than requested, pad later from gg.
    over = 0
    if max_rank > len(ii):
        over = max_rank - len(ii)
        max_rank = len(ii)
    # start ranking
    rankset = []  # store unique features
    ranklist = []  # with overlap
    order = 0
    while len(rankset) < max_rank:
        ## start selection: seed a complementary set with the next feature.
        rnk = 2
        mv_auc = Fauc[order]
        fs = [FName[order]]
        cpms = FmTF[order]  # running intersection of ambiguous regions
        fl = shape(FName)[0]
        while mv_auc != 1:
            ft = 0
            temp = 0
            for j in range(fl):
                if FName[j] not in fs:
                    tmpFmTF = cpms & FmTF[j]
                    # Only consider j if it actually shrinks the region.
                    if not ((FmTF[j] & cpms) == cpms).all():
                        # Mean AUC of the candidate set restricted to the
                        # shared ambiguous samples.
                        mauc = 0
                        for g in fs + [FName[j]]:
                            fval = Fvalue[argwhere(FName == g)[0][0]][tmpFmTF]
                            stwlofe = array(c)[tmpFmTF]
                            argfv = argsort(fval)
                            slofe = stwlofe[argfv]
                            tauc = auc(slofe, Fne[argwhere(FName == g)[0][0]], n0, n1)
                            mauc += tauc
                        tmpauc = mauc / rnk
                        if tmpauc > mv_auc:
                            mv_auc = tmpauc
                            ft = j
                            temp = Fauc[j]
                        elif tmpauc == mv_auc and Fauc[j] > temp:
                            # Tie-break on the candidate's single-feature AUC.
                            ft = j
                            temp = Fauc[j]
            if mv_auc == -2 or ft == 0:
                break  # no candidate improves the set
            fs.append(FName[ft])
            cpms = cpms & FmTF[ft]
            rnk += 1
        # print('\nRank-' + str(rnk - 1) + ' mvAUC: ' + str(mv_auc) + ' Feature set:', fs)
        for i in fs:
            ranklist.append(i)
            if i not in rankset:
                rankset.append(i)
        order += 1
    if over != 0:
        # Pad the requested count with the best of the removed features.
        ranklist = ranklist + list(gg)[:over]
        rankset = rankset + list(gg)[:over]
    return FName, Fauc, rankset, ranklist
================================================
FILE: TransProPy/UtilsFunction1/FilterSamples.py
================================================
import pandas as pd
def filter_samples(threshold, data_path='../data/gene_tpm.csv'):
    """
    Drop genes (rows) whose expression is zero in too many samples.
    ---------------------------------------------------------------
    Parameters
    threshold: float
        For example: 0.9
        Minimum fraction of samples that must be non-zero for a gene
        (row) to be kept; rows at or below this fraction are dropped.
    data_path: string
        For example: '../data/gene_tpm.csv'
        The input matrix must have genes as rows and samples as columns.
    --------------------------------------------------------------------
    Return
    X: pandas.core.frame.DataFrame
        The filtered expression matrix (same orientation as the input).
    ----------------------------------------------------------------
    """
    expression = pd.read_csv(data_path, index_col=0, header=0)
    # Fraction of samples with a non-zero value in each row; keep rows
    # strictly above the threshold.
    nonzero_fraction = expression.astype(bool).sum(axis=1) / expression.shape[1]
    return expression[nonzero_fraction > threshold]
================================================
FILE: TransProPy/UtilsFunction1/GeneNames.py
================================================
import os
from pandas import read_csv, merge
def gene_name(data_path='../data/gene_tpm.csv'):
    """
    Extract the gene names (row index) from an expression matrix CSV.
    -----------------------------------------------------------------
    Parameters:
    data_path: string
        For example: '../data/gene_tpm.csv'
        The input matrix must have genes as rows and samples as columns;
        row names are taken to be the gene names.
    ------------------------------------------------------------------
    Return:
    gene_name: list
        Gene names in file order.
    ------------------------------
    Raises:
    FileNotFoundError: if data_path does not exist.
    """
    # Fail fast with a clear message if the file is missing.
    if not os.path.exists(data_path):
        raise FileNotFoundError(
            f"The data file was not found at '{data_path}'. Please ensure it's in the correct location.")
    expression = read_csv(data_path, header=0, index_col=0)
    # The row index holds the gene identifiers.
    return list(expression.index)
================================================
FILE: TransProPy/UtilsFunction1/GeneToFeatureMapping.py
================================================
def gene_map_feature(gene_names, ranked_features):
    """
    Map ranked feature indices back to gene names.
    ----------------------------------------------
    Parameters
    gene_names: list
        For example: ['GeneA', 'GeneB', 'GeneC', 'GeneD', 'GeneE']
        Gene names, positionally aligned with feature indices.
    ranked_features: list
        For example: [2, 0, 1]
        Feature indices (ints, or strings convertible to int).
    ----------------------------------------------------------
    Return
    gene_to_feature_mapping: dictionary
        Maps each resolvable gene name to its feature index; indices that
        fall outside gene_names are reported on stdout and skipped.
    ----------------------------------------------------------------------
    """
    mapping = {}
    total = len(gene_names)
    for raw_index in ranked_features:
        idx = int(raw_index)
        if 0 <= idx < total:
            mapping[gene_names[idx]] = idx
        else:
            # Out-of-range index: report and continue.
            print(f"Invalid feature index: {idx}")
    return mapping
================================================
FILE: TransProPy/UtilsFunction1/LoadData.py
================================================
from pandas import *
from numpy import *
import os
from TransProPy.UtilsFunction1.AutoNorm import auto_norm
from TransProPy.UtilsFunction1.FilterSamples import filter_samples
def load_data(lable_name, threshold, data_path='../data/gene_tpm.csv', label_path='../data/tumor_class.csv'):
    """
    Read, filter, deduplicate and normalize an expression matrix together
    with its class labels.
    ---------------------------------------------------------------------
    Parameters:
    lable_name: string
        Name of the categorical column to read from the label file
        (e.g. gender, age, quality).
    threshold: float
        For example: 0.9
        Minimum fraction of non-zero samples a gene needs to be kept
        (forwarded to filter_samples).
    data_path: string
        For example: '../data/gene_tpm.csv'
        Genes as rows, samples as columns.
    label_path: string
        For example: '../data/tumor_class.csv'
        Rows are sample names; the selected column must be a numerical
        binary class (e.g. 1,2,1,1,2,2,1).
    -----------------------------------------------------------------
    Returns:
    transpose(f): ndarray
        Feature-by-sample matrix of normalized values.
    c: ndarray
        Classification label per sample.
    -----------------------------------
    Raises:
    FileNotFoundError: if either input path does not exist.
    """
    # Validate both inputs up front so the error names the missing file.
    if not os.path.exists(data_path):
        raise FileNotFoundError(
            f"The data file was not found at '{data_path}'. Please ensure it's in the correct location.")
    if not os.path.exists(label_path):
        raise FileNotFoundError(
            f"The label file was not found at '{label_path}'. Please ensure it's in the correct location.")
    # Filter low-quality genes, then orient samples as rows for merging.
    expression = filter_samples(threshold, data_path).transpose()
    labels = read_csv(label_path, header=0, index_col=0)[lable_name]
    # Inner-join on sample names keeps only samples present in both files.
    merged = merge(expression, labels, left_index=True, right_index=True)
    # unique() drops duplicated sample rows (it also sorts rows).
    unique_rows = unique(merged.values, axis=0)
    features = auto_norm(unique_rows[:, :-1])  # normalize feature columns
    classes = unique_rows[:, -1]               # last column is the label
    return transpose(features), classes
================================================
FILE: TransProPy/UtilsFunction1/NewFeatureRanking.py
================================================
from numpy import *
from TransProPy.UtilsFunction1.Auc import auc
def new_feature_ranking(f, c, AUC_threshold, max_rank, pos, neg, n0, n1):
    """
    MACFC-style complementary feature ranking with a high-AUC cut.

    Features whose single-feature AUC exceeds AUC_threshold are reported
    separately (high_auc_features) and excluded from the complementary
    ranking, which then proceeds on the remaining features as in
    feature_ranking().

    Parameters
    ----------
    f : ndarray
        Feature-by-sample matrix (rows are features).
    c : ndarray
        Class label per sample, aligned with the columns of f.
    AUC_threshold : float
        Features with AUC strictly above this are split off.
    max_rank : int
        Number of unique features requested from the ranking stage.
    pos, neg :
        The two class label values appearing in c.
    n0, n1 : int
        Sample counts of the two classes (forwarded to auc()).

    Returns
    -------
    high_auc_features : list of (name, auc) tuples, AUC-descending.
    FName : ndarray of str
        Remaining features kept after coverage removal, AUC-ranked.
    Fauc : ndarray of float
        AUC values aligned with FName.
    rankset : list
        Unique selected features, in selection order.
    ranklist : list
        Selected features including repeats across complementary sets.
    """
    f_auc = []
    # Feature "names" are stringified row indices into f.
    f_no = [str(i) for i in range(shape(f)[0])]
    # Per-feature boolean mask of the ambiguous (mixed-class) middle region.
    f_mtf = full((shape(f)[0], shape(f)[1]), False)
    f_ne = []
    fl = shape(f_no)[0]
    high_auc_features = []
    for j in range(fl):
        # Sort samples by this feature; slofe is the label sequence.
        argfv = argsort(f[j])
        slofe = c[argfv]
        ne = slofe[0]
        a = auc(slofe, ne, n0, n1)
        if a < 0.5:
            # Feature discriminates in the opposite direction: flip the
            # AUC and negative-class assignment together.
            if slofe[0] == slofe[-1]:
                a = 1 - a
                if ne == pos:
                    ne = neg
                else:
                    ne = pos
        f_auc.append(a)
        f_ne.append(ne)
        # Record features that individually clear the high-AUC bar.
        if a > AUC_threshold:
            high_auc_features.append((f_no[j], a))
        # ml / mr delimit the leading/trailing runs of identical labels;
        # the slice between them is the ambiguous region.
        ml = 1
        mr = 1
        for i in range(1, size(slofe)):
            if slofe[i] == slofe[0]:
                ml += 1
            else:
                break
        for i in range(-2, -size(slofe), -1):
            if slofe[i] == slofe[-1]:
                mr += 1
            else:
                break
        mr = size(slofe) - mr
        if slofe[0] == slofe[-1]:
            if not slofe[0] == ne:
                ml = 0
            else:
                mr = size(slofe)
        f_mtf[j][argfv[ml:mr]] = True
    # PERF: sort once after the scan (previously re-sorted every iteration).
    high_auc_features = sorted(high_auc_features, key=lambda x: x[1], reverse=True)
    # Exclude high-AUC features from the complementary-ranking pool.
    remaining_indices = [i for i, a in enumerate(f_auc) if a <= AUC_threshold]
    remaining_f_no = [f_no[i] for i in remaining_indices]
    remaining_f_auc = [f_auc[i] for i in remaining_indices]
    remaining_f_mtf = [f_mtf[i] for i in remaining_indices]
    remaining_f_ne = [f_ne[i] for i in remaining_indices]
    fl = len(remaining_f_no)
    # Order the remaining features by descending AUC.
    arg_auc = argsort(-array(remaining_f_auc))
    FName = array(remaining_f_no)[arg_auc]
    # BUG FIX: Fvalue previously indexed the FULL matrix f with positions
    # valid only for the filtered list, so value rows did not correspond
    # to FName/Fauc. Select the remaining rows first, then reorder.
    Fvalue = array(f)[remaining_indices][arg_auc]
    Fauc = array(remaining_f_auc)[arg_auc]
    Fne = array(remaining_f_ne)[arg_auc]
    FmTF = array(remaining_f_mtf)[arg_auc]
    # Keep an unmutated copy of the sorted AUCs; Fauc gets -2 markers below.
    Fauc_presort = Fauc.copy()
    kk = 0    # count of features with AUC < 0.5
    slen = 0  # how long the running mask intersection stays non-empty
    Fmcount = ones((len(FmTF[0])))
    Fmcount = Fmcount.astype(bool)
    for i in range(fl):
        if Fauc[i] < 0.5:
            kk += 1
        Fmcount &= FmTF[i]
        if True in Fmcount:
            slen += 1
    # Coverage removal: when one ambiguous region contains another, mark
    # the feature with the larger region as removed (-2).
    for i in range(fl):
        if Fauc[i] < 0.5:
            continue
        for j in range(i + 1, fl):
            if Fauc[j] < 0.5:
                continue
            nflg = 0
            if not ((FmTF[i] & FmTF[j]) == FmTF[i]).all():
                if not ((FmTF[i] & FmTF[j]) == FmTF[j]).all():
                    nflg = 1  # neither region contains the other
            if nflg == 0:
                if FmTF[i].sum() <= FmTF[j].sum():
                    Fauc[j] = -2
                else:
                    Fauc[i] = -2
                    break  # i itself is removed; stop comparing against it
    ii = []  # surviving features (AUC > 0.5, not covered)
    gg = []  # removed/low-AUC features
    for i in range(fl):
        if Fauc[i] > 0.5:
            ii.append(i)
        else:
            gg.append(i)
    # BUG FIX: gg holds positions in the AUC-sorted arrays, but the old
    # code ordered them with the unsorted full-length f_auc list; use the
    # aligned pre-mutation AUCs instead.
    arg_auc_gg = argsort(-Fauc_presort[gg])
    gg = [str(FName[i]) for i in array(gg)[arg_auc_gg]]
    FName = FName[ii]
    Fvalue = Fvalue[ii]
    Fauc = Fauc[ii]
    Fne = Fne[ii]
    FmTF = FmTF[ii]
    # If fewer features survive than requested, pad later from gg.
    over = 0
    if max_rank > len(ii):
        over = max_rank - len(ii)
        max_rank = len(ii)
    # start ranking
    rankset = []   # store unique features
    ranklist = []  # with overlap
    order = 0
    while len(rankset) < max_rank:
        # Seed a complementary set with the next highest-ranked feature.
        rnk = 2
        mv_auc = Fauc[order]
        fs = [FName[order]]
        cpms = FmTF[order]  # running intersection of ambiguous regions
        fl = shape(FName)[0]
        while mv_auc != 1:
            ft = 0
            temp = 0
            for j in range(fl):
                if FName[j] not in fs:
                    tmpFmTF = cpms & FmTF[j]
                    # Only consider j if it actually shrinks the region.
                    if not ((FmTF[j] & cpms) == cpms).all():
                        # Mean AUC of the candidate set restricted to the
                        # shared ambiguous samples.
                        mauc = 0
                        for g in fs + [FName[j]]:
                            fval = Fvalue[argwhere(FName == g)[0][0]][tmpFmTF]
                            stwlofe = array(c)[tmpFmTF]
                            argfv = argsort(fval)
                            slofe = stwlofe[argfv]
                            tauc = auc(slofe, Fne[argwhere(FName == g)[0][0]], n0, n1)
                            mauc += tauc
                        tmpauc = mauc / rnk
                        if tmpauc > mv_auc:
                            mv_auc = tmpauc
                            ft = j
                            temp = Fauc[j]
                        elif tmpauc == mv_auc and Fauc[j] > temp:
                            # Tie-break on single-feature AUC.
                            ft = j
                            temp = Fauc[j]
            if mv_auc == -2 or ft == 0:
                break  # no candidate improves the set
            fs.append(FName[ft])
            cpms = cpms & FmTF[ft]
            rnk += 1
        for i in fs:
            ranklist.append(i)
            if i not in rankset:
                rankset.append(i)
        order += 1
    if over != 0:
        # Pad the requested count with the best of the removed features.
        ranklist = ranklist + list(gg)[:over]
        rankset = rankset + list(gg)[:over]
    # Return the high-AUC features plus the ranked/filtered information.
    return high_auc_features, FName, Fauc, rankset, ranklist
================================================
FILE: TransProPy/UtilsFunction1/PrintResults.py
================================================
def print_results(high_auc_features, fr, fre1, frequency, len_FName, FName, Fauc):
    """
    Print a human-readable summary of the MACFC ranking outputs.

    Note: high_auc_features is accepted for interface compatibility but is
    not printed (matching the existing call sites).
    """
    labelled = (
        ('Ranked features (start from higher rank): ', fr),
        ('Features and its frequency: ', fre1),
        ('Sorted features with frequency higher than 1: ', frequency),
        ('The count of AUC values greater than 0.5: ', len_FName),
        ('The list of feature names after ranking (AUC > 0.5): ', FName),
        ('The list of AUC values corresponding to the ranked feature names: ', Fauc),
    )
    for label, value in labelled:
        print(label, value)
================================================
FILE: TransProPy/UtilsFunction1/__init__.py
================================================
================================================
FILE: TransProPy/UtilsFunction2/LogTransform.py
================================================
import numpy as np
def log_transform(data):
    """
    Evaluate and potentially apply a log2 transformation to data.

    Uses a quantile-based heuristic: the transform fires when the 99th
    percentile exceeds 100, when the overall range is large with a positive
    lower quartile, or when the quartiles sit in a narrow 0-2 band.
    ---------------------------------------------------------------------
    Parameters:
    -data (np.ndarray): A numerical numpy array.
    --------------------------------------------
    Returns:
    -result (np.ndarray): The original data unchanged, or a log2-transformed
     copy in which non-positive entries become NaN. The input array is no
     longer modified in place.
    ------------------------------------------------------------------------
    """
    # Quantiles: min, Q1, median, Q3, 99th percentile, max.
    qx = np.quantile(data, [0., 0.25, 0.5, 0.75, 0.99, 1.0])
    # Define conditions for log transformation
    LogC = (qx[4] > 100) or \
           (qx[5] - qx[0] > 50 and qx[1] > 0) or \
           (qx[1] > 0 and qx[1] < 1 and qx[3] > 1 and qx[3] < 2)
    if LogC:
        # BUG FIX: np.NaN was removed in NumPy 2.0 — use np.nan. Also work
        # on a masked copy instead of mutating the caller's array in place.
        masked = np.where(data > 0, data, np.nan)
        result = np.log2(masked)
        print("log2 transform finished")
    else:
        result = data
        print("log2 transform not needed")
    return result
================================================
FILE: TransProPy/UtilsFunction2/__init__.py
================================================
================================================
FILE: TransProPy/UtilsFunction2/splitdata.py
================================================
import pandas as pd
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
def split_data(gene_data_path, class_data_path, class_name, test_size=0.2, random_state=42, threshold=0.9, random_feature=None):
    """
    Reads the gene expression and class data, processes it, and splits it into training and testing sets.
    -----------------------------------------------------------------------------------------------------
    Parameters:
    - gene_data_path (str): Path to the CSV file containing the gene expression data.
        For example: '../data/gene_tpm.csv' (genes as rows, samples as columns).
    - class_data_path (str): Path to the CSV file containing the class data.
        For example: '../data/tumor_class.csv' (samples as rows).
    - class_name (str): The name of the class column in the class data.
    - test_size (float, optional): The proportion of the data to be used as the testing set. Default is 0.2.
    - random_state (int, optional): The seed used by the random number generator (sampling,
      shuffling and splitting). Default is 42.
    - threshold (float, optional): The threshold used to filter out rows based on the proportion of non-zero values. Default is 0.9.
    - random_feature (int, optional): The number of random features to select. If None, no random feature selection is performed. Default is None.
    ---------------------------------------------------------------------------------------------------------------------------------------------
    Returns:
    - train_data (pd.DataFrame): The training data (features plus the class column).
    - test_data (pd.DataFrame): The testing data (features plus the class column).
    ---------------------------------------------------------------------------------
    """
    # Reading the data
    X = pd.read_csv(gene_data_path, index_col=0, header=0)
    y = pd.read_csv(class_data_path, index_col=0, header=0)
    # Finding common sample names between X (column names) and y (row names)
    common = X.columns.intersection(y.index)
    # Filtering out low-quality genes: keep rows whose non-zero fraction
    # exceeds the threshold.
    non_zero_counts = X.astype(bool).sum(axis=1)
    X = X[non_zero_counts / X.shape[1] > threshold]
    # If random_feature is specified, randomly subsample that many genes.
    if random_feature is not None:
        X = X.sample(n=random_feature, random_state=random_state)
    # Keeping only the common samples in X and y
    X = X.loc[:, common]
    y = y.loc[common]
    # Transposing X (samples become rows) and merging in the class column
    X = X.transpose()
    Y = y[class_name]
    data = pd.merge(X, Y, left_index=True, right_index=True)
    # BUG FIX: the shuffle previously hard-coded random_state=42, silently
    # ignoring the random_state parameter; honor the parameter instead.
    data = shuffle(data, random_state=random_state)
    # Stratified split so both sets preserve the class proportions.
    train_data, test_data = train_test_split(data, test_size=test_size, random_state=random_state, stratify=data[class_name])
    return train_data, test_data
================================================
FILE: TransProPy/UtilsFunction3/EnsembleForRFE.py
================================================
from sklearn.base import BaseEstimator
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
import numpy as np
class EnsembleForRFE(BaseEstimator):
    """
    Ensemble estimator for recursive feature elimination.

    Wraps a linear SVM, a decision tree and a gradient-boosting classifier;
    after fit() their per-feature importances are averaged into a single
    feature_importances_ vector, which is the attribute RFE/RFECV consumes.

    Parameters:
    - svm_C: float, regularization parameter for SVM.
    - tree_max_depth: int, maximum depth of the decision tree.
    - tree_min_samples_split: int, minimum number of samples required to split an internal node.
    - gbm_learning_rate: float, learning rate for gradient boosting.
    - gbm_n_estimators: int, number of boosting stages to be run for gradient boosting.
    """
    def __init__(self, svm_C=1.0, tree_max_depth=None,
                 tree_min_samples_split=2, gbm_learning_rate=0.1,
                 gbm_n_estimators=100):
        # Save passed parameters as class attributes; keeping them under
        # the same names as the __init__ arguments is what makes
        # BaseEstimator.get_params()/clone() work.
        self.svm_C = svm_C
        self.tree_max_depth = tree_max_depth
        self.tree_min_samples_split = tree_min_samples_split
        self.gbm_learning_rate = gbm_learning_rate
        self.gbm_n_estimators = gbm_n_estimators
        # Initialize individual models with the specified parameters
        self.svm = SVC(kernel="linear", probability=True, C=self.svm_C)
        self.tree = DecisionTreeClassifier(max_depth=self.tree_max_depth,
                                           min_samples_split=self.tree_min_samples_split)
        self.gbm = GradientBoostingClassifier(learning_rate=self.gbm_learning_rate,
                                              n_estimators=self.gbm_n_estimators)
        self.feature_importances_ = None  # Initialize feature importances attribute

    def fit(self, X, y):
        """
        Fit the individual models and compute aggregated feature importances.
        Parameters:
        - X: DataFrame, Feature dataset with shape (n_samples, n_features).
        - y: ndarray, 1-D array of target values with shape (n_samples,).
        Returns:
        - self: object, Instance of the model.
        """
        # Fit individual models
        self.svm.fit(X, y)
        self.tree.fit(X, y)
        self.gbm.fit(X, y)
        # Calculate feature importances and store as attributes.
        # For the linear SVM, importance is the absolute hyperplane weight
        # per feature (coef_[0] — the binary-classification row).
        svm_importances = np.abs(self.svm.coef_[0])
        tree_importances = self.tree.feature_importances_
        gbm_importances = self.gbm.feature_importances_
        # Average feature importances across the three models
        self.feature_importances_ = (svm_importances + tree_importances + gbm_importances) / 3
        return self

    def predict(self, X):
        """
        Predict class labels for samples in X using a soft voting mechanism.
        Parameters:
        - X: DataFrame, Input features.
        Returns:
        - Predicted class labels.
        """
        # Get the probability predictions from individual models
        probabilities = np.array([self.svm.predict_proba(X),
                                  self.tree.predict_proba(X),
                                  self.gbm.predict_proba(X)])
        # Average probabilities for soft voting
        avg_prob = np.mean(probabilities, axis=0)
        # Predict class labels based on the highest probability.
        # NOTE(review): argmax yields class *positions* 0..n_classes-1, so
        # this assumes labels are already 0-based integers — confirm with
        # callers (self.svm.classes_ would map positions to actual labels).
        return np.argmax(avg_prob, axis=1)

    def set_params(self, **params):
        """
        Set parameters for the ensemble estimator. This will be used by hyperparameter
        optimization methods like RandomizedSearchCV to update the parameters of the
        individual models.
        Parameters:
        - **params: Keyword arguments for parameter names and values.
        """
        # Update the parameter values based on provided keyword arguments;
        # unknown keys are silently ignored.
        for key, value in params.items():
            if key in ['svm_C', 'tree_max_depth',
                       'tree_min_samples_split', 'gbm_learning_rate',
                       'gbm_n_estimators']:
                setattr(self, key, value)
        # Re-initialize the models with the updated parameters (discards
        # any previously fitted state).
        self.svm = SVC(kernel="linear", probability=True, C=self.svm_C)
        self.tree = DecisionTreeClassifier(max_depth=self.tree_max_depth,
                                           min_samples_split=self.tree_min_samples_split)
        self.gbm = GradientBoostingClassifier(learning_rate=self.gbm_learning_rate,
                                              n_estimators=self.gbm_n_estimators)
        return self
================================================
FILE: TransProPy/UtilsFunction3/ExtractAndSaveResults.py
================================================
# TransProPy.UtilsFunction3.extract_and_save_results.py
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from TransProPy.UtilsFunction3.PrintBoxedText import print_boxed_text
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import StratifiedKFold
def extract_and_save_results(
        clf,
        X,
        Y,
        save_path,
        n_cv,
        show_plot=False,
        use_tkagg=False):
    """
    Extract and save various results from the trained model: an accuracy-per-
    iteration plot, a cross-validated ROC curve, and CSVs of the selected
    features with their importances/scores.

    Parameters:
    - clf: trained model (RandomizedSearchCV object).
    - X: DataFrame, feature data used for training.
    - Y: array-like, target labels aligned with X (used for the ROC curve).
    - save_path: str, base path for saving results (filenames are appended).
    - n_cv: int, number of StratifiedKFold splits for the ROC cross-validation.
    - show_plot: bool, whether to display the plot.
    - use_tkagg: bool, whether to use 'TkAgg' backend for matplotlib. Generally, choose True when using in PyCharm IDE, and choose False when rendering file.qmd to an HTML file.
    """
    # Setting the matplotlib backend to 'TkAgg' if specified
    if use_tkagg:
        import matplotlib
        matplotlib.use('TkAgg')
    # Extracting cross-validation results
    cv_results = clf.cv_results_
    mean_test_scores = cv_results['mean_test_score']  # Calculate the average test score for each iteration
    n_iterations = len(mean_test_scores)
    # Plotting and saving the accuracy per iteration figure
    plt.figure(figsize=(6, 4), facecolor='#f0f8fe')
    plt.plot(range(1, n_iterations + 1), mean_test_scores, marker='o')
    plt.title('Model Accuracy per Iteration')
    plt.xlabel('Iteration')
    plt.ylabel('Mean Test Accuracy')
    plt.grid(True, color='#11479c', alpha=0.2)
    # Get the current axes (ax), and set the background color of the plot area
    ax = plt.gca()
    ax.set_facecolor('#e1f0fb')
    # Call tight_layout to automatically adjust the layout
    plt.tight_layout()
    plt.savefig(save_path + "Model_Accuracy_per_Iteration_figure.pdf", format='pdf')
    # Optionally display the plot
    if show_plot:
        plt.show()
    # plotting the ROC curve
    # Predict probabilities with cross-validation over the best estimator
    y_probas = cross_val_predict(clf.best_estimator_, X, Y, cv=StratifiedKFold(n_splits=n_cv), method='predict_proba')
    # Take the probability of the positive class (column 1 — assumes binary
    # classification with the positive class second).
    y_scores = y_probas[:, 1]
    # Calculate values for the ROC curve
    fpr, tpr, thresholds = roc_curve(Y, y_scores)
    # Calculate AUC value
    roc_auc = roc_auc_score(Y, y_scores)
    # Plot and save the ROC curve
    plt.figure(figsize=(6, 4), facecolor='#f0f8fe')
    plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')  # Diagonal line for a random classifier
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend(loc='lower right')
    # Get the current axes (ax), and set the background color of the plot area
    ax = plt.gca()
    ax.set_facecolor('#e1f0fb')
    # Call tight_layout to automatically adjust the layout
    plt.tight_layout()
    plt.savefig(save_path + "ROC_Curve_figure.pdf", format='pdf')
    # Optionally display the plot
    if show_plot:
        plt.show()
    # Extracting feature selection results.
    # NOTE(review): assumes the pipeline step 'feature_selection' is a
    # FeatureUnion whose transformer_list is [RFECV, SelectKBest] in that
    # order — confirm against the pipeline built in SetupFeatureSelection.
    feature_union = clf.best_estimator_.named_steps['feature_selection']
    rfecv = feature_union.transformer_list[0][1]
    selectkbest = feature_union.transformer_list[1][1]
    selected_features_rfecv = rfecv.support_
    selected_features_selectkbest = selectkbest.get_support()
    # Print selected features
    print_boxed_text("Features selected by RFECV:")
    print(X.columns[selected_features_rfecv])
    print_boxed_text("Features selected by SelectKBest:")
    print(X.columns[selected_features_selectkbest])
    # Combining (union of the two masks) and saving selected features
    combined_selected_features = np.logical_or(selected_features_rfecv, selected_features_selectkbest)
    combined_features_df = pd.DataFrame({'Feature': X.columns[combined_selected_features]})
    combined_features_df.to_csv(save_path + 'combined_features.csv', index=False)
    print_boxed_text(f"Total number of selected features: {combined_features_df.shape[0]}")
    # Extracting and saving EnsembleForRFE feature importances,
    # sorted descending by importance
    ensemble_for_rfe = feature_union.transformer_list[0][1].estimator_
    feature_importances_ensemble = ensemble_for_rfe.feature_importances_
    importances_ensemble = zip(X.columns[selected_features_rfecv], feature_importances_ensemble)
    sorted_importances_ensemble = sorted(importances_ensemble, key=lambda x: x[1], reverse=True)
    df_importances_ensemble = pd.DataFrame(sorted_importances_ensemble, columns=['Feature', 'Importance'])
    df_importances_ensemble.to_csv(save_path + 'ensemble_importances.csv', index=False)
    print_boxed_text("Feature Importances from EnsembleForRFE:")
    print(df_importances_ensemble)
    # Extracting and saving SelectKBest scores, sorted descending by score
    selectkbest_scores = selectkbest.scores_[selected_features_selectkbest]
    scores_selectkbest = zip(X.columns[selected_features_selectkbest], selectkbest_scores)
    sorted_scores_selectkbest = sorted(scores_selectkbest, key=lambda x: x[1], reverse=True)
    df_scores_selectkbest = pd.DataFrame(sorted_scores_selectkbest, columns=['Feature', 'Score'])
    df_scores_selectkbest.to_csv(save_path + 'selectkbest_scores.csv', index=False)
    print_boxed_text("Scores from SelectKBest:")
    print(df_scores_selectkbest)
================================================
FILE: TransProPy/UtilsFunction3/ExtractCommonSamples.py
================================================
def extract_common_samples(X, Y):
    """
    Restrict two DataFrames to the rows whose indices appear in both.

    Parameters:
    X (pd.DataFrame): First DataFrame.
    Y (pd.DataFrame): Second DataFrame.

    Returns:
    pd.DataFrame, pd.DataFrame: The two inputs filtered to their shared
    row indices (in the order produced by Index.intersection).
    """
    shared = X.index.intersection(Y.index)
    return X.loc[shared], Y.loc[shared]
================================================
FILE: TransProPy/UtilsFunction3/LoadAndPreprocessData.py
================================================
# TransProPy.UtilsFunction3.load_and_preprocess_data.py
from TransProPy.UtilsFunction3.LoadFilterTranspose import load_filter_transpose
from TransProPy.UtilsFunction3.LoadEncodeLabels import load_encode_labels
from TransProPy.UtilsFunction3.ExtractCommonSamples import extract_common_samples
def load_and_preprocess_data(feature_file, label_file, label_column, threshold):
    """
    Load and preprocess the feature matrix and labels for model training.

    Parameters:
    - feature_file: str, path to the feature data file.
    - label_file: str, path to the label data file.
    - label_column: str, column name of the labels in the label file.
    - threshold: float, threshold for filtering in load_filter_transpose.

    Returns:
    - X: DataFrame, filtered and transposed feature data (samples as rows).
    - Y: ndarray, 1-D encoded label array aligned with X's rows.
    """
    features = load_filter_transpose(threshold, feature_file)   # filter + transpose features
    labels = load_encode_labels(label_file, label_column)       # encode labels numerically
    features, labels = extract_common_samples(features, labels) # align on shared samples
    return features, labels.values.ravel()
================================================
FILE: TransProPy/UtilsFunction3/LoadEncodeLabels.py
================================================
from sklearn.preprocessing import LabelEncoder
import pandas as pd
def load_encode_labels(file_path, column_name):
    """
    Read a label CSV and encode one categorical column as numeric labels.

    Parameters:
    file_path (str): Path to the CSV file containing labels (samples as rows).
    column_name (str): Name of the column to be encoded.

    Returns:
    Y (pd.DataFrame): A single-column DataFrame of the encoded numeric
    labels, indexed like the input file.

    Raises:
    ValueError: if column_name is not present in the file.
    """
    labels = pd.read_csv(file_path, index_col=0, header=0)
    if column_name not in labels.columns:
        raise ValueError(f"Column '{column_name}' not found in the DataFrame")
    # Scikit-learn models need numeric targets; LabelEncoder maps the
    # categorical values to integers.
    encoder = LabelEncoder()
    encoded = encoder.fit_transform(labels[column_name])
    # Re-wrap as a DataFrame so the sample index is preserved.
    return pd.DataFrame(encoded, index=labels.index, columns=[column_name])
================================================
FILE: TransProPy/UtilsFunction3/LoadFilterTranspose.py
================================================
import pandas as pd
def load_filter_transpose(threshold, data_path='../data/gene_tpm.csv'):
    """
    Drop genes whose expression is zero in too large a share of samples,
    then transpose the matrix to samples-by-genes orientation.
    -----------------------------------------
    Parameters
    data_path: string
        For example: '../data/gene_tpm.csv'
        Please note: The input data matrix should have genes as rows and
        samples as columns.
    threshold: float
        For example: 0.9
        A gene is kept only when the fraction of samples with a non-zero
        value for that gene is strictly greater than this threshold.
    --------------------------------------------------------------------------------------------------------
    Return
    X: pandas.core.frame.DataFrame
        Filtered matrix, transposed to samples (rows) x genes (columns).
    -----------------------------------
    """
    expression = pd.read_csv(data_path, index_col=0, header=0)
    # Fraction of samples with a non-zero value, computed per gene (row).
    sample_count = expression.shape[1]
    nonzero_fraction = expression.astype(bool).sum(axis=1) / sample_count
    # Keep rows above the threshold, then flip to (samples x genes).
    kept = expression.loc[nonzero_fraction > threshold]
    return kept.transpose()
================================================
FILE: TransProPy/UtilsFunction3/LoggingCustomScorer.py
================================================
from sklearn.metrics import accuracy_score
import logging
import time
def logging_custom_scorer(n_iter=10, n_cv=5):
    """
    Build a scorer callable that computes accuracy and logs, on every call,
    the score together with the wall-clock time elapsed since the previous
    call (or since scorer creation, for the first call).

    Parameters:
        n_iter (int): Number of search iterations. Default is 10. (Kept for
            interface compatibility; not referenced inside the scorer.)
        n_cv (int): Number of cross-validation splits. Default is 5.
            (Likewise unreferenced inside the scorer.)

    Returns:
        function: Scorer with signature (y_true, y_pred) -> float accuracy.
    """
    # Timestamp of the previous scoring call, seeded at creation time.
    last_time = time.time()

    def scorer(y_true, y_pred):
        """Compute accuracy, log it with the elapsed time, and return it."""
        nonlocal last_time  # shared timer across successive calls
        now = time.time()
        elapsed = now - last_time
        last_time = now
        score = accuracy_score(y_true, y_pred)
        logging.info(
            f"One scoring iteration completed, accuracy: {score}, "
            f"time taken: {elapsed:.2f} seconds"
        )
        return score

    return scorer
================================================
FILE: TransProPy/UtilsFunction3/PrintBoxedText.py
================================================
def print_boxed_text(title):
    """
    Print `title` to stdout framed in a hash/equals box.

    The output, preceded by one blank line, looks like:

        #=======#
        # Title #
        #=======#

    Parameters:
        title (str): Text to display inside the box.

    Returns:
        None. The formatted title is written directly to the console.
    """
    # Horizontal rule sized to the title plus one space of padding per side.
    rule = "#{}#".format("=" * (len(title) + 2))
    print()  # blank separator line before the box
    print(rule)
    print(f"# {title} #")
    print(rule)
================================================
FILE: TransProPy/UtilsFunction3/SetupFeatureSelection.py
================================================
# TransProPy.UtilsFunction3.setup_feature_selection.py
from sklearn.feature_selection import RFECV, SelectKBest, mutual_info_classif
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import FeatureUnion
from TransProPy.UtilsFunction3.EnsembleForRFE import EnsembleForRFE
def setup_feature_selection():
    """
    Build the combined feature-selection transformer.

    Returns:
        FeatureUnion: Union of two selectors — an RFECV step (recursive
        feature elimination with 5-fold stratified CV around the custom
        EnsembleForRFE estimator, scored by accuracy) and a SelectKBest
        step scored by mutual information.
    """
    rfe_step = RFECV(
        estimator=EnsembleForRFE(),
        cv=StratifiedKFold(5),
        scoring='accuracy',
    )
    kbest_step = SelectKBest(score_func=mutual_info_classif)
    return FeatureUnion([("rfecv", rfe_step), ("selectkbest", kbest_step)])
================================================
FILE: TransProPy/UtilsFunction3/SetupLoggingAndProgressBar.py
================================================
import logging
from tqdm import tqdm
def setup_logging_and_progress_bar(n_iter, n_cv):
    """
    Configure root logging to write to 'progress.log' and create a tqdm
    progress bar sized to the total number of scoring iterations.

    Parameters:
        n_iter (int): Number of iterations for RandomizedSearchCV.
        n_cv (int): Number of cross-validation folds.

    Returns:
        tqdm object: An initialized tqdm progress bar with
        total = n_iter * n_cv.
    """
    log_format = '%(asctime)s - %(levelname)s: %(message)s'
    # Configure basic logging - console output only; the file handler is
    # attached explicitly below.
    logging.basicConfig(level=logging.INFO, format=log_format)
    logger = logging.getLogger()
    # Bug fix: previously every call appended a brand-new FileHandler to the
    # root logger, so calling this function more than once duplicated every
    # record in progress.log and leaked open file handles. Close and remove
    # any handler already attached for this file before adding a fresh one.
    for handler in list(logger.handlers):
        if (isinstance(handler, logging.FileHandler)
                and getattr(handler, 'baseFilename', '').endswith('progress.log')):
            logger.removeHandler(handler)
            handler.close()
    # Create a file handler that truncates and logs to 'progress.log'.
    file_handler = logging.FileHandler('progress.log', mode='w')
    file_handler.setLevel(logging.INFO)
    file_handler.setFormatter(logging.Formatter(log_format))
    logger.addHandler(file_handler)
    # One progress tick is expected per (iteration x CV fold) scoring call.
    total_iterations = n_iter * n_cv
    pbar = tqdm(total=total_iterations, desc='RandomizedSearchCV Progress')
    return pbar
================================================
FILE: TransProPy/UtilsFunction3/TqdmCustomScorer.py
================================================
from sklearn.metrics import make_scorer, accuracy_score
from tqdm import tqdm
def tqdm_custom_scorer(n_iter=10, n_cv=5):
    """
    Create an accuracy scorer that advances a tqdm progress bar on every
    call, for use with model evaluation processes like RandomizedSearchCV.

    Parameters:
        n_iter (int): Number of iterations for the search process.
            Default is 10.
        n_cv (int): Number of cross-validation splits. Default is 5.

    Returns:
        function: Scorer with signature (y_true, y_pred) -> float accuracy.
    """
    # One bar tick is expected per (iteration x fold) scoring call.
    progress = tqdm(total=n_iter * n_cv, desc='RandomizedSearchCV progress')

    def scorer(y_true, y_pred):
        """Score one evaluation with accuracy and tick the progress bar."""
        result = accuracy_score(y_true, y_pred)
        progress.update()
        return result

    return scorer
================================================
FILE: TransProPy/UtilsFunction3/TrainModel.py
================================================
# TransProPy.UtilsFunction3.train_model.py
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import make_scorer
from TransProPy.UtilsFunction3.LoggingCustomScorer import logging_custom_scorer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import StackingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
def train_model(X, Y, feature_selection, parameters, n_iter, n_cv, n_jobs=9):
    """
    Build the model-training search object: a pipeline (standard scaling ->
    feature selection -> stacking classifier) wrapped in RandomizedSearchCV.

    NOTE(review): X and Y are accepted but never referenced in this body;
    the caller is expected to invoke .fit(X, Y) on the returned object.
    Consequently the returned search object is configured but NOT yet
    fitted, despite what earlier documentation suggested.

    Parameters:
    - X: DataFrame, feature data (unused here; see note above).
    - Y: ndarray, label data (unused here; see note above).
    - feature_selection: FeatureUnion, the feature selection process.
    - parameters: dict, parameter distributions for RandomizedSearchCV.
    - n_iter: int, number of parameter settings sampled by the search.
    - n_cv: int, number of stratified cross-validation folds.
    - n_jobs: int, number of jobs to run in parallel (default is 9).

    Returns:
    - clf: an *unfitted* RandomizedSearchCV object.
    """
    # Pipeline: scale features, apply the supplied feature-selection step,
    # then classify with a stacking ensemble (SVM + decision tree + gradient
    # boosting, combined by a logistic-regression meta-learner).
    feature_selection_pipeline = Pipeline([
        ('scale', StandardScaler()),
        ('feature_selection', feature_selection),
        ('stacking', StackingClassifier(
            estimators=[
                ('svm', SVC(probability=True)),
                ('dt', DecisionTreeClassifier()),
                ('gbm', GradientBoostingClassifier())
            ],
            final_estimator=LogisticRegression()))
    ])
    clf = RandomizedSearchCV(
        feature_selection_pipeline,
        parameters,
        cv=StratifiedKFold(n_splits=n_cv),
        # Custom scorer: logs accuracy and per-call timing for each split.
        scoring=make_scorer(logging_custom_scorer(n_iter=n_iter, n_cv=n_cv)),
        n_iter=n_iter,
        random_state=0,
        error_score='raise',  # surface scoring/fitting errors instead of NaN
        n_jobs=n_jobs  # Use the customizable n_jobs parameter
    )
    return clf
================================================
FILE: TransProPy/UtilsFunction3/UpdateProgressBar.py
================================================
def update_progress_bar(pbar, log_file='progress.log'):
    """
    Synchronise a tqdm progress bar with the number of lines currently
    written to a log file (one line is appended per logged iteration).

    Parameters:
        pbar (tqdm): The tqdm progress bar object to advance.
        log_file (str): Path to the log file, default is 'progress.log'.
    """
    # Count log entries: one line per completed iteration.
    with open(log_file, 'r') as handle:
        logged = sum(1 for _ in handle)
    # Advance only by the delta beyond the bar's current position.
    pbar.update(logged - pbar.n)
================================================
FILE: TransProPy/UtilsFunction3/__init__.py
================================================
================================================
FILE: TransProPy/__init__.py
================================================
================================================
FILE: setup.py
================================================
from setuptools import setup, find_packages
# Runtime dependencies named separately so they read apart from the metadata.
_RUNTIME_DEPS = [
    "numpy",
    "pandas",
    "setuptools",
    "scikit-learn",
    "tqdm",
]

# Package metadata and build configuration for PyPI distribution.
setup(
    name='transpropy',
    version='1.0.0',
    packages=find_packages(),
    install_requires=_RUNTIME_DEPS,
    url='https://github.com/SSSYDYSSS/TransProPy',
    author='Yu Dongyue',
    author_email='yudongyue@mail.nankai.edu.cn',
    description='A collection of deep learning models that integrate algorithms and various machine learning approaches to extract features (genes) effective for classification and attribute them accordingly.'
)
gitextract_wxsffzrf/ ├── .github/ │ └── workflows/ │ └── python-package.yml ├── .gitignore ├── .idea/ │ ├── .gitignore │ ├── TransProPy.iml │ ├── inspectionProfiles/ │ │ └── profiles_settings.xml │ ├── modules.xml │ └── vcs.xml ├── LICENSE ├── README.md ├── TransProPy/ │ ├── AutoFeatureSelection.py │ ├── AutogluonSelectML.py │ ├── AutogluonTimeLimit.py │ ├── MACFCmain.py │ ├── NewMACFCmain.py │ ├── UtilsFunction1/ │ │ ├── Auc.py │ │ ├── AutoNorm.py │ │ ├── FeatureRanking.py │ │ ├── FilterSamples.py │ │ ├── GeneNames.py │ │ ├── GeneToFeatureMapping.py │ │ ├── LoadData.py │ │ ├── NewFeatureRanking.py │ │ ├── PrintResults.py │ │ └── __init__.py │ ├── UtilsFunction2/ │ │ ├── LogTransform.py │ │ ├── __init__.py │ │ └── splitdata.py │ ├── UtilsFunction3/ │ │ ├── EnsembleForRFE.py │ │ ├── ExtractAndSaveResults.py │ │ ├── ExtractCommonSamples.py │ │ ├── LoadAndPreprocessData.py │ │ ├── LoadEncodeLabels.py │ │ ├── LoadFilterTranspose.py │ │ ├── LoggingCustomScorer.py │ │ ├── PrintBoxedText.py │ │ ├── SetupFeatureSelection.py │ │ ├── SetupLoggingAndProgressBar.py │ │ ├── TqdmCustomScorer.py │ │ ├── TrainModel.py │ │ ├── UpdateProgressBar.py │ │ └── __init__.py │ └── __init__.py └── setup.py
SYMBOL INDEX (33 symbols across 29 files)
FILE: TransProPy/AutoFeatureSelection.py
function auto_feature_selection (line 11) | def auto_feature_selection(data_file, label_file, label_col, threshold, ...
FILE: TransProPy/AutogluonSelectML.py
function AutoGluon_SelectML (line 4) | def AutoGluon_SelectML(gene_data_path, class_data_path, label_column, te...
FILE: TransProPy/AutogluonTimeLimit.py
function Autogluon_TimeLimit (line 3) | def Autogluon_TimeLimit(gene_data_path, class_data_path, label_column, t...
FILE: TransProPy/MACFCmain.py
function MACFCmain (line 7) | def MACFCmain(max_rank, lable_name, threshold, data_path='../data/gene_t...
FILE: TransProPy/NewMACFCmain.py
function New_MACFCmain (line 7) | def New_MACFCmain(AUC_threshold, max_rank, lable_name, threshold, data_p...
FILE: TransProPy/UtilsFunction1/Auc.py
function auc (line 2) | def auc(tlofe, ne, n0, n1):
FILE: TransProPy/UtilsFunction1/AutoNorm.py
function auto_norm (line 2) | def auto_norm(data):
FILE: TransProPy/UtilsFunction1/FeatureRanking.py
function feature_ranking (line 3) | def feature_ranking(f, c, max_rank, pos, neg, n0, n1):
FILE: TransProPy/UtilsFunction1/FilterSamples.py
function filter_samples (line 3) | def filter_samples(threshold, data_path='../data/gene_tpm.csv'):
FILE: TransProPy/UtilsFunction1/GeneNames.py
function gene_name (line 5) | def gene_name(data_path='../data/gene_tpm.csv'):
FILE: TransProPy/UtilsFunction1/GeneToFeatureMapping.py
function gene_map_feature (line 1) | def gene_map_feature(gene_names, ranked_features):
FILE: TransProPy/UtilsFunction1/LoadData.py
function load_data (line 6) | def load_data(lable_name, threshold, data_path='../data/gene_tpm.csv', l...
FILE: TransProPy/UtilsFunction1/NewFeatureRanking.py
function new_feature_ranking (line 4) | def new_feature_ranking(f, c, AUC_threshold, max_rank, pos, neg, n0, n1):
FILE: TransProPy/UtilsFunction1/PrintResults.py
function print_results (line 1) | def print_results(high_auc_features, fr, fre1, frequency, len_FName, FNa...
FILE: TransProPy/UtilsFunction2/LogTransform.py
function log_transform (line 3) | def log_transform(data):
FILE: TransProPy/UtilsFunction2/splitdata.py
function split_data (line 5) | def split_data(gene_data_path, class_data_path, class_name, test_size=0....
FILE: TransProPy/UtilsFunction3/EnsembleForRFE.py
class EnsembleForRFE (line 8) | class EnsembleForRFE(BaseEstimator):
method __init__ (line 20) | def __init__(self, svm_C=1.0, tree_max_depth=None,
method fit (line 39) | def fit(self, X, y):
method predict (line 64) | def predict(self, X):
method set_params (line 85) | def set_params(self, **params):
FILE: TransProPy/UtilsFunction3/ExtractAndSaveResults.py
function extract_and_save_results (line 10) | def extract_and_save_results(
FILE: TransProPy/UtilsFunction3/ExtractCommonSamples.py
function extract_common_samples (line 1) | def extract_common_samples(X, Y):
FILE: TransProPy/UtilsFunction3/LoadAndPreprocessData.py
function load_and_preprocess_data (line 8) | def load_and_preprocess_data(feature_file, label_file, label_column, thr...
FILE: TransProPy/UtilsFunction3/LoadEncodeLabels.py
function load_encode_labels (line 4) | def load_encode_labels(file_path, column_name):
FILE: TransProPy/UtilsFunction3/LoadFilterTranspose.py
function load_filter_transpose (line 3) | def load_filter_transpose(threshold, data_path='../data/gene_tpm.csv'):
FILE: TransProPy/UtilsFunction3/LoggingCustomScorer.py
function logging_custom_scorer (line 5) | def logging_custom_scorer(n_iter=10, n_cv=5):
FILE: TransProPy/UtilsFunction3/PrintBoxedText.py
function print_boxed_text (line 1) | def print_boxed_text(title):
FILE: TransProPy/UtilsFunction3/SetupFeatureSelection.py
function setup_feature_selection (line 9) | def setup_feature_selection():
FILE: TransProPy/UtilsFunction3/SetupLoggingAndProgressBar.py
function setup_logging_and_progress_bar (line 4) | def setup_logging_and_progress_bar(n_iter, n_cv):
FILE: TransProPy/UtilsFunction3/TqdmCustomScorer.py
function tqdm_custom_scorer (line 4) | def tqdm_custom_scorer(n_iter=10, n_cv=5):
FILE: TransProPy/UtilsFunction3/TrainModel.py
function train_model (line 14) | def train_model(X, Y, feature_selection, parameters, n_iter, n_cv, n_job...
FILE: TransProPy/UtilsFunction3/UpdateProgressBar.py
function update_progress_bar (line 1) | def update_progress_bar(pbar, log_file='progress.log'):
Condensed preview — 43 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (82K chars).
[
{
"path": ".github/workflows/python-package.yml",
"chars": 1082,
"preview": "name: Python package\n\non:\n push:\n branches: [ \"main\" ]\n pull_request:\n branches: [ \"main\" ]\n\njobs:\n build:\n\n "
},
{
"path": ".gitignore",
"chars": 3493,
"preview": "# Created by https://www.toptal.com/developers/gitignore/api/python\n# Edit at https://www.toptal.com/developers/gitignor"
},
{
"path": ".idea/.gitignore",
"chars": 176,
"preview": "# Default ignored files\n/shelf/\n/workspace.xml\n# Editor-based HTTP Client requests\n/httpRequests/\n# Datasource local sto"
},
{
"path": ".idea/TransProPy.iml",
"chars": 443,
"preview": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<module type=\"PYTHON_MODULE\" version=\"4\">\n <component name=\"NewModuleRootManager"
},
{
"path": ".idea/inspectionProfiles/profiles_settings.xml",
"chars": 174,
"preview": "<component name=\"InspectionProjectProfileManager\">\n <settings>\n <option name=\"USE_PROJECT_PROFILE\" value=\"false\" />\n"
},
{
"path": ".idea/modules.xml",
"chars": 272,
"preview": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<project version=\"4\">\n <component name=\"ProjectModuleManager\">\n <modules>\n "
},
{
"path": ".idea/vcs.xml",
"chars": 180,
"preview": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<project version=\"4\">\n <component name=\"VcsDirectoryMappings\">\n <mapping dire"
},
{
"path": "LICENSE",
"chars": 1521,
"preview": "BSD 3-Clause License\n\nCopyright (c) 2023- Yu Dongyue, SuperOmics All rights reserved.\n\nRedistribution and use in source "
},
{
"path": "README.md",
"chars": 1785,
"preview": "[](https://pypi.org/project/transpropy/) [:\n lpp = 0\n lnp = 0\n flag = 0\n aac = 0\n for i in range(-1, "
},
{
"path": "TransProPy/UtilsFunction1/AutoNorm.py",
"chars": 1527,
"preview": "from numpy import *\ndef auto_norm(data):\n # data:(sample,feature)\n \"\"\"\n Normalization Function\n The auto"
},
{
"path": "TransProPy/UtilsFunction1/FeatureRanking.py",
"chars": 4812,
"preview": "from numpy import *\nfrom TransProPy.UtilsFunction1.Auc import auc\ndef feature_ranking(f, c, max_rank, pos, neg, n0, n1):"
},
{
"path": "TransProPy/UtilsFunction1/FilterSamples.py",
"chars": 1133,
"preview": "import pandas as pd\n\ndef filter_samples(threshold, data_path='../data/gene_tpm.csv'):\n \"\"\"\n Remove samples with hi"
},
{
"path": "TransProPy/UtilsFunction1/GeneNames.py",
"chars": 1146,
"preview": "import os\nfrom pandas import read_csv, merge\n\n\ndef gene_name(data_path='../data/gene_tpm.csv'):\n \"\"\"\n Extract gene"
},
{
"path": "TransProPy/UtilsFunction1/GeneToFeatureMapping.py",
"chars": 1111,
"preview": "def gene_map_feature(gene_names, ranked_features):\n \"\"\"\n gene map feature.\n ------------------------\n Parame"
},
{
"path": "TransProPy/UtilsFunction1/LoadData.py",
"chars": 2677,
"preview": "from pandas import *\nfrom numpy import *\nimport os\nfrom TransProPy.UtilsFunction1.AutoNorm import auto_norm\nfrom TransPr"
},
{
"path": "TransProPy/UtilsFunction1/NewFeatureRanking.py",
"chars": 5863,
"preview": "from numpy import *\nfrom TransProPy.UtilsFunction1.Auc import auc\n\ndef new_feature_ranking(f, c, AUC_threshold, max_rank"
},
{
"path": "TransProPy/UtilsFunction1/PrintResults.py",
"chars": 490,
"preview": "def print_results(high_auc_features, fr, fre1, frequency, len_FName, FName, Fauc):\n\n print('Ranked features (start fr"
},
{
"path": "TransProPy/UtilsFunction1/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "TransProPy/UtilsFunction2/LogTransform.py",
"chars": 1273,
"preview": "import numpy as np\n\ndef log_transform(data):\n \"\"\"\n Evaluate and potentially apply log2 transformation to data.\n "
},
{
"path": "TransProPy/UtilsFunction2/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "TransProPy/UtilsFunction2/splitdata.py",
"chars": 2837,
"preview": "import pandas as pd\nfrom sklearn.utils import shuffle\nfrom sklearn.model_selection import train_test_split\n\ndef split_da"
},
{
"path": "TransProPy/UtilsFunction3/EnsembleForRFE.py",
"chars": 4416,
"preview": "from sklearn.base import BaseEstimator\nfrom sklearn.svm import SVC\nfrom sklearn.tree import DecisionTreeClassifier\nfrom "
},
{
"path": "TransProPy/UtilsFunction3/ExtractAndSaveResults.py",
"chars": 5458,
"preview": "# TransProPy.UtilsFunction3.extract_and_save_results.py\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot "
},
{
"path": "TransProPy/UtilsFunction3/ExtractCommonSamples.py",
"chars": 572,
"preview": "def extract_common_samples(X, Y):\n \"\"\"\n Extracts common samples (rows) from two DataFrames based on their indices."
},
{
"path": "TransProPy/UtilsFunction3/LoadAndPreprocessData.py",
"chars": 1102,
"preview": "# TransProPy.UtilsFunction3.load_and_preprocess_data.py\n\nfrom TransProPy.UtilsFunction3.LoadFilterTranspose import load_"
},
{
"path": "TransProPy/UtilsFunction3/LoadEncodeLabels.py",
"chars": 1207,
"preview": "from sklearn.preprocessing import LabelEncoder\nimport pandas as pd\n\ndef load_encode_labels(file_path, column_name):\n "
},
{
"path": "TransProPy/UtilsFunction3/LoadFilterTranspose.py",
"chars": 1162,
"preview": "import pandas as pd\n\ndef load_filter_transpose(threshold, data_path='../data/gene_tpm.csv'):\n \"\"\"\n Remove samples "
},
{
"path": "TransProPy/UtilsFunction3/LoggingCustomScorer.py",
"chars": 1597,
"preview": "from sklearn.metrics import accuracy_score\nimport logging\nimport time\n\ndef logging_custom_scorer(n_iter=10, n_cv=5):\n "
},
{
"path": "TransProPy/UtilsFunction3/PrintBoxedText.py",
"chars": 967,
"preview": "def print_boxed_text(title):\n \"\"\"\n Prints a title in a boxed format.\n\n This function creates a box around the g"
},
{
"path": "TransProPy/UtilsFunction3/SetupFeatureSelection.py",
"chars": 745,
"preview": "# TransProPy.UtilsFunction3.setup_feature_selection.py\n\nfrom sklearn.feature_selection import RFECV, SelectKBest, mutual"
},
{
"path": "TransProPy/UtilsFunction3/SetupLoggingAndProgressBar.py",
"chars": 1422,
"preview": "import logging\nfrom tqdm import tqdm\n\ndef setup_logging_and_progress_bar(n_iter, n_cv):\n \"\"\"\n Set up logging and i"
},
{
"path": "TransProPy/UtilsFunction3/TqdmCustomScorer.py",
"chars": 1368,
"preview": "from sklearn.metrics import make_scorer, accuracy_score\nfrom tqdm import tqdm\n\ndef tqdm_custom_scorer(n_iter=10, n_cv=5)"
},
{
"path": "TransProPy/UtilsFunction3/TrainModel.py",
"chars": 1947,
"preview": "# TransProPy.UtilsFunction3.train_model.py\n\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.model_selection import St"
},
{
"path": "TransProPy/UtilsFunction3/UpdateProgressBar.py",
"chars": 684,
"preview": "def update_progress_bar(pbar, log_file='progress.log'):\n \"\"\"\n Read the number of log entries in the log file and u"
},
{
"path": "TransProPy/UtilsFunction3/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "TransProPy/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "setup.py",
"chars": 588,
"preview": "from setuptools import setup, find_packages\n\nsetup(\n name='transpropy',\n version='1.0.0',\n packages=find_packag"
}
]
About this extraction
This page contains the full source code of the SSSYDYSSS/TransProPy GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 43 files (74.8 KB), approximately 19.2k tokens, and a symbol index with 33 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.