Repository: SSSYDYSSS/TransProPy Branch: main Commit: 2b61483a9cc6 Files: 43 Total size: 74.8 KB Directory structure: gitextract_wxsffzrf/ ├── .github/ │ └── workflows/ │ └── python-package.yml ├── .gitignore ├── .idea/ │ ├── .gitignore │ ├── TransProPy.iml │ ├── inspectionProfiles/ │ │ └── profiles_settings.xml │ ├── modules.xml │ └── vcs.xml ├── LICENSE ├── README.md ├── TransProPy/ │ ├── AutoFeatureSelection.py │ ├── AutogluonSelectML.py │ ├── AutogluonTimeLimit.py │ ├── MACFCmain.py │ ├── NewMACFCmain.py │ ├── UtilsFunction1/ │ │ ├── Auc.py │ │ ├── AutoNorm.py │ │ ├── FeatureRanking.py │ │ ├── FilterSamples.py │ │ ├── GeneNames.py │ │ ├── GeneToFeatureMapping.py │ │ ├── LoadData.py │ │ ├── NewFeatureRanking.py │ │ ├── PrintResults.py │ │ └── __init__.py │ ├── UtilsFunction2/ │ │ ├── LogTransform.py │ │ ├── __init__.py │ │ └── splitdata.py │ ├── UtilsFunction3/ │ │ ├── EnsembleForRFE.py │ │ ├── ExtractAndSaveResults.py │ │ ├── ExtractCommonSamples.py │ │ ├── LoadAndPreprocessData.py │ │ ├── LoadEncodeLabels.py │ │ ├── LoadFilterTranspose.py │ │ ├── LoggingCustomScorer.py │ │ ├── PrintBoxedText.py │ │ ├── SetupFeatureSelection.py │ │ ├── SetupLoggingAndProgressBar.py │ │ ├── TqdmCustomScorer.py │ │ ├── TrainModel.py │ │ ├── UpdateProgressBar.py │ │ └── __init__.py │ └── __init__.py └── setup.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .github/workflows/python-package.yml ================================================ name: Python package on: push: branches: [ "main" ] pull_request: branches: [ "main" ] jobs: build: runs-on: ubuntu-latest strategy: fail-fast: false matrix: python-version: ["3.9", "3.10", "3.11"] steps: - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v3 with: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | python -m pip install --upgrade pip python -m pip install flake8 pytest if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - name: Lint with flake8 run: | # Lint the code but don't fail the build on errors flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics || true flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics - name: Test with pytest run: | # Run pytest but don't fail the build if no tests are defined or if tests fail pytest || true ================================================ FILE: .gitignore ================================================ # Created by https://www.toptal.com/developers/gitignore/api/python # Edit at https://www.toptal.com/developers/gitignore?templates=python ### Python ### # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. 
*.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover *.py,cover .hypothesis/ .pytest_cache/ cover/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder .pybuilder/ target/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py # pyenv # For a library or package, you might want to ignore these files since the code is # intended to run in multiple environments; otherwise, check them in: # .python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. # However, in case of collaboration, if having platform-specific dependencies or dependencies # having no cross-platform support, pipenv may install dependencies that don't work, or not # install all needed dependencies. #Pipfile.lock # poetry # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. # This is especially recommended for binary packages to ensure reproducibility, and is more # commonly ignored for libraries. # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control #poetry.lock # pdm # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. #pdm.lock # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it # in version control. # https://pdm.fming.dev/#use-with-ide .pdm.toml # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm __pypackages__/ # Celery stuff celerybeat-schedule celerybeat.pid # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .dmypy.json dmypy.json # Pyre type checker .pyre/ # pytype static type analyzer .pytype/ # Cython debug symbols cython_debug/ # PyCharm # JetBrains specific template is maintained in a separate JetBrains.gitignore that can # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
#.idea/

### Python Patch ###
# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
poetry.toml

# ruff
.ruff_cache/

# LSP config files
pyrightconfig.json

# End of https://www.toptal.com/developers/gitignore/api/python

================================================
FILE: .idea/.gitignore
================================================
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml

================================================
FILE: .idea/TransProPy.iml
================================================

================================================
FILE: .idea/inspectionProfiles/profiles_settings.xml
================================================

================================================
FILE: .idea/modules.xml
================================================

================================================
FILE: .idea/vcs.xml
================================================

================================================
FILE: LICENSE
================================================
BSD 3-Clause License

Copyright (c) 2023- Yu Dongyue, SuperOmics
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

* Neither the name of the copyright holder nor the names of its
  contributors may be used to endorse or promote products derived from
  this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

================================================
FILE: README.md
================================================
[![pypi-badge](https://img.shields.io/pypi/v/transpropy)](https://pypi.org/project/transpropy/)
[![License](https://img.shields.io/github/license/SSSYDYSSS/TransProPy)](https://github.com/SSSYDYSSS/TransProPy/blob/main/LICENSE)
[![Build Status](https://github.com/SSSYDYSSS/TransProPy/actions/workflows/python-package.yml/badge.svg)](https://github.com/SSSYDYSSS/TransProPy/actions/workflows/python-package.yml)

# TransProPy

TransProPy Logo

A Python package that integrates algorithms and various machine learning approaches to extract features (genes) that are effective for classification, and to attribute them accordingly.
## Installation

```bash
pip3 install TransProPy
# or install the latest development version from GitHub:
pip3 install git+https://github.com/SSSYDYSSS/TransProPy.git
```

## Usage

```python
# e.g., rank feature genes with the MACFC algorithm:
from TransProPy.MACFCmain import MACFCmain

results = MACFCmain(max_rank=50, lable_name='gender', threshold=0.9)
```

## Citation

If you use TransPro in your research, please cite:

Dongyue Yu; Chen Li; Shuo Yan; Lujiale Guo; Jingyu Liang; Shengquan Chen*; Wenjun Bu* (2026). Comparative Evaluation of Differential Gene Selection Methods in Transcriptomics: Bias Correction and Visualization with TransPro. Manuscript in preparation.

**Correspondence:**
Shengquan Chen — School of Mathematical Sciences and LPMC, Nankai University, Tianjin 300071, China.
Wenjun Bu — Institute of Entomology, College of Life Sciences, Nankai University, Tianjin 300071, China.

## More examples

See the TransProPy Manual: https://sssydysss.github.io/TransProPyBook/

## Contributing

Pull requests are welcome. For major changes, please open an issue first to discuss what you would like to change.

## License

This project is licensed under the BSD 3-Clause License - see the [LICENSE](./LICENSE) file for details.

================================================
FILE: TransProPy/AutoFeatureSelection.py
================================================
import threading
import time
from scipy.stats import reciprocal, randint
from TransProPy.UtilsFunction3.LoadAndPreprocessData import load_and_preprocess_data
from TransProPy.UtilsFunction3.SetupFeatureSelection import setup_feature_selection
from TransProPy.UtilsFunction3.TrainModel import train_model
from TransProPy.UtilsFunction3.ExtractAndSaveResults import extract_and_save_results
from TransProPy.UtilsFunction3.SetupLoggingAndProgressBar import setup_logging_and_progress_bar
from TransProPy.UtilsFunction3.UpdateProgressBar import update_progress_bar


def auto_feature_selection(data_file, label_file, label_col, threshold, show_plot, show_progress,
                           n_iter=5, n_cv=5, n_jobs=9, save_path='../data/', sleep_interval=1, use_tkagg=False):
    """
    Run the complete analysis pipeline, from data loading to training and result extraction.

    Parameters:
    - data_file: str, path to the feature data file.
    - label_file: str, path to the label data file.
    - label_col: str, name of the label column.
    - threshold: float, threshold for data preprocessing.
    - show_plot: bool, whether to display plots.
    - show_progress: bool, whether to show a progress bar.
    - n_iter: int, number of iterations for RandomizedSearchCV.
    - n_cv: int, number of folds for cross-validation.
    - n_jobs: int, number of parallel jobs for RandomizedSearchCV.
    - save_path: str, path to save results.
    - sleep_interval: int, interval time in seconds between progress bar updates.
    - use_tkagg: bool, whether to use the 'TkAgg' backend for matplotlib.
      Generally, choose True when working in the PyCharm IDE, and False when rendering a .qmd file to an HTML file.
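
    Example (illustrative; the file paths match the package's documented defaults, and 'class' is a hypothetical label column):
    >>> auto_feature_selection(data_file='../data/gene_tpm.csv', label_file='../data/tumor_class.csv',
    ...                        label_col='class', threshold=0.9, show_plot=False, show_progress=True)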
""" # Load and preprocess data X, Y = load_and_preprocess_data(data_file, label_file, label_col, threshold) # Set up feature selection feature_selection = setup_feature_selection() # Define parameters for RandomizedSearchCV parameters = { 'feature_selection__rfecv__estimator__svm__C': reciprocal(0.001, 1000), 'feature_selection__rfecv__estimator__tree__max_depth': randint(2, 10), 'feature_selection__rfecv__estimator__tree__min_samples_split': randint(2, 10), 'feature_selection__rfecv__estimator__gbm__learning_rate': reciprocal(0.01, 0.2), 'feature_selection__rfecv__estimator__gbm__n_estimators': randint(100, 500), 'feature_selection__rfecv__step': randint(10, 150), 'feature_selection__rfecv__min_features_to_select': randint(10, 1000), 'feature_selection__selectkbest__k': randint(10, 200), 'stacking__final_estimator__C': reciprocal(0.001, 1000) # Parameter for logistic regression in stacking classifier } # Train the model clf = train_model(X, Y, feature_selection, parameters, n_iter, n_cv, n_jobs) # Define a function to run RandomizedSearchCV def run_randomized_search(): clf.fit(X, Y) # Initialize tqdm progress bar and logging if show_progress: progress_bar = setup_logging_and_progress_bar(n_iter, n_cv) search_thread = threading.Thread(target=run_randomized_search) # Use threading to run RandomizedSearchCV search_thread.start() # Update the progress bar in the main thread while search_thread.is_alive(): update_progress_bar(progress_bar) time.sleep(sleep_interval) # Ensure RandomizedSearchCV completes search_thread.join() # The main thread will wait for the search thread to complete all search and computation processes before continuing to execute the code after the main thread. This ensures that the main thread continues only after the search process is fully completed and the results are returned. else: run_randomized_search() # Extract and save results extract_and_save_results(clf, X, Y, save_path, n_cv, show_plot, use_tkagg) ================================================ FILE: TransProPy/AutogluonSelectML.py ================================================ from autogluon.tabular import TabularDataset, TabularPredictor from TransProPy.UtilsFunction2.splitdata import split_data def AutoGluon_SelectML(gene_data_path, class_data_path, label_column, test_size, threshold, hyperparameters=None, random_feature=None, num_bag_folds=None, num_stack_levels=None, time_limit=120, random_state=42): """ Trains a model using AutoGluon on provided data path and returns feature importance and model leaderboard. ---------------------------------------------------------------------------------------------------------- Parameters: - gene_data_path (str): Path to the gene expression data CSV file. For example: '../data/gene_tpm.csv' - class_data_path (str): Path to the class data CSV file. For example: '../data/tumor_class.csv' - label_column (str): Name of the column in the dataset that is the target label for prediction. - test_size (float): Proportion of the data to be used as the test set. - threshold (float): The threshold used to filter out rows based on the proportion of non-zero values. - hyperparameters (dict, optional): Dictionary of hyperparameters for the models. For example: {'GBM': {}, 'RF': {}} - random_feature (int, optional): The number of random feature to select. If None, no random feature selection is performed. Default is None. - num_bag_folds (int, optional): *Please note: This parameter annotation source can be referred to the documentation link in References. 
      Number of folds used for bagging of models. When `num_bag_folds = k`, training time is roughly increased by a factor of `k`
      (set = 0 to disable bagging). Disabled by default (0), but we recommend values between 5-10 to maximize predictive performance.
      Increasing num_bag_folds will result in models with lower bias but that are more prone to overfitting.
      `num_bag_folds = 1` is an invalid value, and will raise a ValueError.
      Values > 10 may produce diminishing returns, and can even harm overall results due to overfitting.
      To further improve predictions, avoid increasing `num_bag_folds` much beyond 10 and instead increase `num_bag_sets`.
      default = None
    - num_stack_levels (int, optional): *Please note: this parameter description is adapted from the documentation linked under References.
      Number of stacking levels to use in stack ensemble. Roughly increases model training time by factor of `num_stack_levels+1`
      (set = 0 to disable stack ensembling). Disabled by default (0), but we recommend values between 1-3 to maximize predictive performance.
      To prevent overfitting, `num_bag_folds >= 2` must also be set or else a ValueError will be raised.
      default = None
    - time_limit (int, optional): Time limit for training in seconds. Default is 120.
    - random_state (int, optional): The seed used by the random number generator. Default is 42.
    --------------------------------------------------------------------------------------------
    Returns:
    - importance (DataFrame): DataFrame containing feature importance.
    - leaderboard (DataFrame): DataFrame containing model performance on the test data.
    -----------------------------------------------------------------------------------
    References:
    Scientific Publications:
    - AutoGluon-Tabular: Robust and Accurate AutoML for Structured Data (Arxiv, 2020)
    - Fast, Accurate, and Simple Models for Tabular Data via Augmented Distillation (NeurIPS, 2020)
    - Multimodal AutoML on Structured Tables with Text Fields (ICML AutoML Workshop, 2021)
    Articles:
    - AutoGluon for tabular data: 3 lines of code to achieve top 1% in Kaggle competitions (AWS Open Source Blog, Mar 2020)
    - Accurate image classification in 3 lines of code with AutoGluon (Medium, Feb 2020)
    - AutoGluon overview & example applications (Towards Data Science, Dec 2019)
    Documentation:
    - https://auto.gluon.ai/0.1.0/api/autogluon.predictor.html?highlight=num_bag_folds
    --------------------------------------------------------------------------------
    """
    train_data, test_data = split_data(gene_data_path, class_data_path, class_name=label_column,
                                       test_size=test_size, random_state=random_state,
                                       threshold=threshold, random_feature=random_feature)
    train_data = TabularDataset(train_data)
    test_data = TabularDataset(test_data)

    # Train the model using AutoGluon
    predictor = TabularPredictor(label=label_column).fit(train_data, hyperparameters=hyperparameters,
                                                         time_limit=time_limit, num_bag_folds=num_bag_folds,
                                                         num_stack_levels=num_stack_levels)

    # Get the feature importance
    importance = predictor.feature_importance(test_data, subsample_size=None)

    # Get the leaderboard of models
    leaderboard = predictor.leaderboard(test_data)

    return importance, leaderboard


================================================
FILE: TransProPy/AutogluonTimeLimit.py
================================================
from autogluon.tabular import TabularDataset, TabularPredictor
from TransProPy.UtilsFunction2.splitdata import split_data


def Autogluon_TimeLimit(gene_data_path, class_data_path, label_column,
                        test_size, threshold, random_feature=None, num_bag_folds=None,
                        num_stack_levels=None, time_limit=120, random_state=42):
    """
    2.1_autogluon_time-limit.
    Trains a model with AutoGluon on the data at the provided paths and returns feature importance and the model leaderboard.
    ----------------------------------------------------------------------------------------------------------
    Parameters:
    - gene_data_path (str): Path to the gene expression data CSV file. For example: '../data/gene_tpm.csv'
    - class_data_path (str): Path to the class data CSV file. For example: '../data/tumor_class.csv'
    - label_column (str): Name of the column in the dataset that is the target label for prediction.
    - test_size (float): Proportion of the data to be used as the test set.
    - threshold (float): The threshold used to filter out rows based on the proportion of non-zero values.
    - random_feature (int, optional): The number of random features to select. If None, no random feature selection is performed. Default is None.
    - num_bag_folds (int, optional): *Please note: this parameter description is adapted from the documentation linked under References.
      Number of folds used for bagging of models. When `num_bag_folds = k`, training time is roughly increased by a factor of `k`
      (set = 0 to disable bagging). Disabled by default (0), but we recommend values between 5-10 to maximize predictive performance.
      Increasing num_bag_folds will result in models with lower bias but that are more prone to overfitting.
      `num_bag_folds = 1` is an invalid value, and will raise a ValueError.
      Values > 10 may produce diminishing returns, and can even harm overall results due to overfitting.
      To further improve predictions, avoid increasing `num_bag_folds` much beyond 10 and instead increase `num_bag_sets`.
      default = None
    - num_stack_levels (int, optional): *Please note: this parameter description is adapted from the documentation linked under References.
      Number of stacking levels to use in stack ensemble. Roughly increases model training time by factor of `num_stack_levels+1`
      (set = 0 to disable stack ensembling). Disabled by default (0), but we recommend values between 1-3 to maximize predictive performance.
      To prevent overfitting, `num_bag_folds >= 2` must also be set or else a ValueError will be raised.
      default = None
    - time_limit (int, optional): Time limit for training in seconds. Default is 120.
    - random_state (int, optional): The seed used by the random number generator. Default is 42.
    ----------------------------------------------------------------------------------
    Returns:
    - importance (DataFrame): DataFrame containing feature importance.
    - leaderboard (DataFrame): DataFrame containing model performance on the test data.
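
    Example (illustrative; the paths are the package's documented defaults, and 'class' is a hypothetical label column):
    >>> importance, leaderboard = Autogluon_TimeLimit(
    ...     gene_data_path='../data/gene_tpm.csv', class_data_path='../data/tumor_class.csv',
    ...     label_column='class', test_size=0.2, threshold=0.9, time_limit=120)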
    -----------------------------------------------------------------------------------
    References:
    Scientific Publications:
    - AutoGluon-Tabular: Robust and Accurate AutoML for Structured Data (Arxiv, 2020)
    - Fast, Accurate, and Simple Models for Tabular Data via Augmented Distillation (NeurIPS, 2020)
    - Multimodal AutoML on Structured Tables with Text Fields (ICML AutoML Workshop, 2021)
    Articles:
    - AutoGluon for tabular data: 3 lines of code to achieve top 1% in Kaggle competitions (AWS Open Source Blog, Mar 2020)
    - Accurate image classification in 3 lines of code with AutoGluon (Medium, Feb 2020)
    - AutoGluon overview & example applications (Towards Data Science, Dec 2019)
    Documentation:
    - https://auto.gluon.ai/0.1.0/api/autogluon.predictor.html?highlight=num_bag_folds
    --------------------------------------------------------------------------------
    """
    train_data, test_data = split_data(gene_data_path, class_data_path, class_name=label_column,
                                       test_size=test_size, random_state=random_state,
                                       threshold=threshold, random_feature=random_feature)
    train_data = TabularDataset(train_data)
    test_data = TabularDataset(test_data)

    # Train the model using AutoGluon
    predictor = TabularPredictor(label=label_column).fit(train_data, time_limit=time_limit,
                                                         num_bag_folds=num_bag_folds,
                                                         num_stack_levels=num_stack_levels)

    # Get the feature importance
    importance = predictor.feature_importance(test_data, subsample_size=None)

    # Get the leaderboard of models
    leaderboard = predictor.leaderboard(test_data)

    return importance, leaderboard


================================================
FILE: TransProPy/MACFCmain.py
================================================
from numpy import *
from TransProPy.UtilsFunction1.LoadData import load_data
from TransProPy.UtilsFunction1.FeatureRanking import feature_ranking
from TransProPy.UtilsFunction1.PrintResults import print_results
from collections import Counter


def MACFCmain(max_rank, lable_name, threshold,
              data_path='../data/gene_tpm.csv', label_path='../data/tumor_class.csv'):
    """
    1.1_feature_ranking_model.
    Applies MACFC selection to identify feature genes relevant for classification.
    --------------------------------------------------------------------------
    Parameters:
    max_rank: int
        The total number of gene combinations you want to obtain.
    lable_name: string
        For example: gender, age, altitude, temperature, quality, and other categorical variable names.
    data_path: string
        For example: '../data/gene_tpm.csv'
        Please note: Preprocess the input data in advance to remove samples that contain too many missing values or zeros.
    label_path: string
        For example: '../data/tumor_class.csv'
        Please note: The input sample categories must be in a numerical binary format, such as: 1,2,1,1,2,2,1.
        In this case, the numerical values represent the following classifications: 1: male; 2: female.
    threshold: float
        For example: 0.9
        The threshold indicates the required proportion of non-zero samples among all samples for each feature.
    --------------------------------------------------------------------------------------------------------
    Returns:
    fr: list of strings
        representing ranked features.
    fre1: dictionary
        feature names as keys and their frequencies as values.
    frequency: list of tuples
        feature names and their frequencies, sorted by occurrence frequency (in descending order).
        This list includes only those features from the dictionary fre1 (the counted frequencies of
        features in the original data) that occur more than once, along with their frequencies.
    len(FName): integer
        count of AUC values greater than 0.5.
    FName: array of strings
        feature names after ranking with AUC > 0.5.
    Fauc: array of floats
        AUC values corresponding to the ranked feature names.
    ---------------------------------------------------------
    References:
    - Su,Y., Du,K., Wang,J., Wei,J. and Liu,J. (2022) Multi-variable AUC for sifting complementary features and its biomedical application. Briefings in Bioinformatics, 23, bbac029.
    -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
    """
    # load data
    f, c = load_data(lable_name, threshold, data_path, label_path)
    pos, neg = set(c)
    n0, n1 = list(c).count(pos), list(c).count(neg)
    FName, Fauc, fr, fre = feature_ranking(f, c, max_rank, pos, neg, n0, n1)  # Note that here n0 and n1 are passed as parameters.
    fre1 = dict(Counter(fre))
    fre2 = {key: value for key, value in fre1.items() if value > 1}
    frequency = sorted(fre2.items(), key=lambda kv: (kv[1], kv[0]), reverse=True)
    # print_results(fr, fre1, frequency, len(FName), FName, Fauc)
    return(fr, fre1, frequency, len(FName), FName, Fauc)


================================================
FILE: TransProPy/NewMACFCmain.py
================================================
from numpy import *
from TransProPy.UtilsFunction1.LoadData import load_data
from TransProPy.UtilsFunction1.NewFeatureRanking import new_feature_ranking
from TransProPy.UtilsFunction1.PrintResults import print_results
from collections import Counter


def New_MACFCmain(AUC_threshold, max_rank, lable_name, threshold,
                  data_path='../data/gene_tpm.csv', label_path='../data/tumor_class.csv'):
    """
    1.1_feature_ranking_model.
    Applies MACFC selection to identify feature genes relevant for classification.
    --------------------------------------------------------------------------
    Parameters:
    AUC_threshold: float
        AUC threshold for feature selection. Features with AUC values higher than this threshold
        are recorded but not used in subsequent calculations.
    max_rank: int
        The total number of gene combinations you want to obtain.
    lable_name: string
        For example: gender, age, altitude, temperature, quality, and other categorical variable names.
    data_path: string
        For example: '../data/gene_tpm.csv'
        Please note: Preprocess the input data in advance to remove samples that contain too many missing values or zeros.
    label_path: string
        For example: '../data/tumor_class.csv'
        Please note: The input sample categories must be in a numerical binary format, such as: 1,2,1,1,2,2,1.
        In this case, the numerical values represent the following classifications: 1: male; 2: female.
    threshold: float
        For example: 0.9
        The threshold indicates the required proportion of non-zero samples among all samples for each feature.
    --------------------------------------------------------------------------------------------------------
    Returns:
    high_auc_features: list of tuples
        This list contains tuples of feature indices and their corresponding AUC values, for features whose
        AUC value is greater than AUC_threshold. Each tuple consists of the feature's index in string format
        and its AUC value as a float. These features are highly predictive, with a strong ability to
        distinguish between the classes in the classification task.
fr: list of strings representing ranked features. fre1: dictionary feature names as keys and their frequencies as values. frequency: list of tuples feature names and their frequencies. The frequency outputs a list sorted by occurrence frequency (in descending order). This list includes only those elements from the dictionary fre1 (which represents the counted frequencies of elements in the original data) that have an occurrence frequency greater than once, along with their frequencies. len(FName): integer count of AUC values greater than 0.5. FName: array of strings feature names after ranking with AUC > 0.5. Fauc: array of floats AUC values corresponding to the ranked feature names. --------------------------------------------------------- References: - Su,Y., Du,K., Wang,J., Wei,J. and Liu,J. (2022) Multi-variable AUC for sifting complementary features and its biomedical application. Briefings in Bioinformatics, 23, bbac029. ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- """ # load data f, c = load_data(lable_name, threshold, data_path, label_path) pos, neg = set(c) n0, n1 = list(c).count(pos), list(c).count(neg) high_auc_features, FName, Fauc, fr, fre = new_feature_ranking(f, c, AUC_threshold, max_rank, pos, neg, n0, n1) # Note that here n0 and n1 are passed as parameters. fre1 = dict(Counter(fre)) fre2 = {key: value for key, value in fre1.items() if value > 1} frequency = sorted(fre2.items(), key=lambda kv: (kv[1], kv[0]), reverse=True) # print_results(fr, fre1, frequency, len(FName), FName, Fauc) return(high_auc_features, fr, fre1, frequency, len(FName), FName, Fauc) ================================================ FILE: TransProPy/UtilsFunction1/Auc.py ================================================ from numpy import * def auc(tlofe, ne, n0, n1): lpp = 0 lnp = 0 flag = 0 aac = 0 for i in range(-1, -size(tlofe) - 1, -1): if tlofe[i] == ne: if flag == 1: aac += lnp * lpp flag = 0 lpp = 0 lnp += 1 else: if flag == 0: flag = 1 lpp += 1 aac += lnp * lpp auc = (n0 * n1 - aac) / (n0 * n1) return auc ================================================ FILE: TransProPy/UtilsFunction1/AutoNorm.py ================================================ from numpy import * def auto_norm(data): # data:(sample,feature) """ Normalization Function The auto_norm function is designed to normalize a two-dimensional array (matrix). The purpose of normalization is generally to bring all features into the same numerical range, facilitating subsequent analysis or model training. ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ Parameters: data: ndarray Order Requirements for Input Data: 1.This function does indeed have specific requirements for the row and column order of the input matrix data. Rows should represent individual samples, and columns should represent different features. In other words, each row vector represents a sample containing multiple features. 2.Each column of the matrix will be independently normalized, so different features should be placed in separate columns. ----------------------------------------------------------------------------------------------------------------------------- Returns: norm_data: ndarray It is the normalized data. 
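
    Example (illustrative; each column is min-max scaled independently):
    >>> auto_norm(array([[1., 10.], [2., 20.], [3., 30.]]))
    array([[0. , 0. ],
           [0.5, 0.5],
           [1. , 1. ]])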
------------------------------ """ mins = data.min(0) maxs = data.max(0) ranges = maxs - mins row = data.shape[0] norm_data = data - tile(mins, (row, 1)) norm_data = norm_data / tile(ranges, (row, 1)) return norm_data ================================================ FILE: TransProPy/UtilsFunction1/FeatureRanking.py ================================================ from numpy import * from TransProPy.UtilsFunction1.Auc import auc def feature_ranking(f, c, max_rank, pos, neg, n0, n1): f_auc = [] f_no = [str(i) for i in range(shape(f)[0])] f_mtf = full((shape(f)[0], shape(f)[1]), False) f_ne = [] fl = shape(f_no)[0] # print('fl ', fl) for j in range(fl): argfv = argsort(f[j]) slofe = c[argfv] ne = slofe[0] a = auc(slofe, ne, n0, n1) if a < 0.5: if slofe[0] == slofe[-1]: a = 1 - a if ne == pos: ne = neg else: ne = pos f_auc.append(a) f_ne.append(ne) ml = 1 mr = 1 for i in range(1, size(slofe)): if slofe[i] == slofe[0]: ml += 1 else: break for i in range(-2, -size(slofe), -1): if slofe[i] == slofe[-1]: mr += 1 else: break mr = size(slofe) - mr if slofe[0] == slofe[-1]: if not slofe[0] == ne: ml = 0 else: mr = size(slofe) f_mtf[j][argfv[ml:mr]] = True # print(f_auc) arg_auc = argsort(-array(f_auc)) FName = array(f_no)[arg_auc] Fvalue = array(f)[arg_auc] Fauc = array(f_auc)[arg_auc] Fne = array(f_ne)[arg_auc] FmTF = array(f_mtf)[arg_auc] # print('SORT VALUE', Fvalue) # print('SORT M', FmTF) # print('SORT NAME', FName) # print('SORT AUC', Fauc) kk = 0 slen = 0 Fmcount = ones((len(FmTF[0]))) Fmcount = Fmcount.astype(bool) for i in range(fl): if Fauc[i] < 0.5: kk += 1 Fmcount &= FmTF[i] if True in Fmcount: slen += 1 # print('Totally ', kk, ' features with auc under 0.5') for i in range(fl): if Fauc[i] < 0.5: continue for j in range(i + 1, fl): if Fauc[j] < 0.5: continue nflg = 0 if not ((FmTF[i] & FmTF[j]) == FmTF[i]).all(): if not ((FmTF[i] & FmTF[j]) == FmTF[j]).all(): nflg = 1 if nflg == 0: if FmTF[i].sum() <= FmTF[j].sum(): Fauc[j] = -2 else: Fauc[i] = -2 break ii = [] gg = [] for i in range(fl): if Fauc[i] > 0.5: ii.append(i) else: gg.append(i) arg_auc_gg = argsort(-array(f_auc)[gg]) gg = [str(FName[i]) for i in array(gg)[arg_auc_gg]] # print('Totally ' + str(fl - len(ii)) + ' features are covered and removed.') # print(gg) FName = FName[ii] Fvalue = Fvalue[ii] Fauc = Fauc[ii] Fne = Fne[ii] FmTF = FmTF[ii] over = 0 if max_rank > len(ii): over = max_rank - len(ii) max_rank = len(ii) # start ranking rankset = [] # store unique features ranklist = [] # with overlap order = 0 while len(rankset) < max_rank: ## start selection rnk = 2 mv_auc = Fauc[order] fs = [FName[order]] cpms = FmTF[order] fl = shape(FName)[0] while mv_auc != 1: ft = 0 temp = 0 for j in range(fl): if FName[j] not in fs: tmpFmTF = cpms & FmTF[j] if not ((FmTF[j] & cpms) == cpms).all(): mauc = 0 for g in fs + [FName[j]]: fval = Fvalue[argwhere(FName == g)[0][0]][tmpFmTF] stwlofe = array(c)[tmpFmTF] argfv = argsort(fval) slofe = stwlofe[argfv] tauc = auc(slofe, Fne[argwhere(FName == g)[0][0]], n0, n1) mauc += tauc tmpauc = mauc / rnk if tmpauc > mv_auc: mv_auc = tmpauc ft = j temp = Fauc[j] elif tmpauc == mv_auc and Fauc[j] > temp: ft = j temp = Fauc[j] if mv_auc == -2 or ft == 0: break fs.append(FName[ft]) cpms = cpms & FmTF[ft] rnk += 1 # print('\nRank-' + str(rnk - 1) + ' mvAUC: ' + str(mv_auc) + ' Feature set:', fs) for i in fs: ranklist.append(i) if i not in rankset: rankset.append(i) order += 1 if over != 0: ranklist = ranklist + list(gg)[:over] rankset = rankset + list(gg)[:over] return FName, Fauc, rankset, ranklist 
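

# Illustrative smoke test (an editor's sketch on synthetic data; not part of the original pipeline).
# Feature 0 separates the two classes perfectly, feature 1 is its mirror image, and feature 2 is
# weaker (AUC 0.75), so the ranking should prefer features 0 and 1.
if __name__ == "__main__":
    f_demo = array([[0.1, 0.2, 0.8, 0.9],
                    [0.9, 0.8, 0.2, 0.1],
                    [0.5, 0.9, 0.6, 0.4]])
    c_demo = array([1, 1, 2, 2])
    FName_d, Fauc_d, rankset_d, ranklist_d = feature_ranking(
        f_demo, c_demo, max_rank=2, pos=1, neg=2, n0=2, n1=2)
    print('rankset:', rankset_d, 'ranklist:', ranklist_d)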
================================================
FILE: TransProPy/UtilsFunction1/FilterSamples.py
================================================
import pandas as pd


def filter_samples(threshold, data_path='../data/gene_tpm.csv'):
    """
    Remove features (genes) whose expression is zero in too many samples.
    -----------------------------------------
    Parameters
    data_path: string
        For example: '../data/gene_tpm.csv'
        Please note: The input data matrix should have genes as rows and samples as columns.
    threshold: float
        For example: 0.9
        The threshold indicates the required proportion of non-zero samples among all samples for each feature.
    --------------------------------------------------------------------------------------------------------
    Return
    X: pandas.core.frame.DataFrame
    -----------------------------------
    """
    data = pd.read_csv(data_path, index_col=0, header=0)

    # Calculate the count of non-zero values in each row.
    non_zero_counts = data.astype(bool).sum(axis=1)

    # Keep rows whose proportion of non-zero values exceeds the threshold.
    # threshold = 0.9
    X = data[non_zero_counts / data.shape[1] > threshold]

    # Return the result.
    return X


================================================
FILE: TransProPy/UtilsFunction1/GeneNames.py
================================================
import os
from pandas import read_csv, merge


def gene_name(data_path='../data/gene_tpm.csv'):
    """
    Extract the gene names.
    ------------------------
    Parameters:
    data_path: string
        For example: '../data/gene_tpm.csv'
        Please note: Preprocess the input data in advance to remove samples that contain too many missing values or zeros.
        The input data matrix should have genes as rows and samples as columns.
    ---------------------------------------------------------------------------
    Return:
    gene_names: list
    ---------------------------------------------------------------------------
    """
    # Check if the data file exists at the given path
    if not os.path.exists(data_path):
        raise FileNotFoundError(
            f"The data file was not found at '{data_path}'. Please ensure it's in the correct location.")

    # Load the data
    data = read_csv(data_path, header=0, index_col=0)

    # Get the gene names directly from the row names (assuming row names are gene names)
    gene_names = data.index.tolist()

    return gene_names


================================================
FILE: TransProPy/UtilsFunction1/GeneToFeatureMapping.py
================================================
def gene_map_feature(gene_names, ranked_features):
    """
    Map gene names to their ranked feature indices.
    ------------------------
    Parameters
    gene_names: list
        For example: ['GeneA', 'GeneB', 'GeneC', 'GeneD', 'GeneE'] containing strings
    ranked_features: list
        For example: [2, 0, 1] containing integers
    -----------------------
    Return
    gene_to_feature_mapping: dictionary
        A Python dictionary that maps gene names to their corresponding (ranked) feature indices.
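
    Example (illustrative):
    >>> gene_map_feature(['GeneA', 'GeneB', 'GeneC'], ['2', '0'])
    {'GeneC': 2, 'GeneA': 0}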
    -----------------------------------------------------------------------------------------------------------------------------------------------
    """
    gene_to_feature_mapping = {}
    for feature_index_str in ranked_features:
        feature_index = int(feature_index_str)
        if 0 <= feature_index < len(gene_names):
            gene_name = gene_names[feature_index]
            gene_to_feature_mapping[gene_name] = feature_index
        else:
            print(f"Invalid feature index: {feature_index}")
    return gene_to_feature_mapping


================================================
FILE: TransProPy/UtilsFunction1/LoadData.py
================================================
from pandas import *
from numpy import *
import os
from TransProPy.UtilsFunction1.AutoNorm import auto_norm
from TransProPy.UtilsFunction1.FilterSamples import filter_samples


def load_data(lable_name, threshold,
              data_path='../data/gene_tpm.csv', label_path='../data/tumor_class.csv'):
    """
    Read the data, filter low-quality features, normalize the values, and extract
    the feature matrix together with the categorical labels.
    ---------------------------------------------
    Parameters:
    lable_name: string
        For example: gender, age, altitude, temperature, quality, and other categorical variable names.
    data_path: string
        For example: '../data/gene_tpm.csv'
        Please note: Preprocess the input data in advance to remove samples that contain too many missing values or zeros.
        The input data matrix should have genes as rows and samples as columns.
    label_path: string
        For example: '../data/tumor_class.csv'
        Please note: The input CSV data should have rows representing sample names and columns representing class names.
        The input sample categories must be in a numerical binary format, such as: 1,2,1,1,2,2,1.
        In this case, the numerical values represent the following classifications: 1: male; 2: female.
    threshold: float
        For example: 0.9
        The threshold indicates the required proportion of non-zero samples among all samples for each feature.
    --------------------------------------------------------------------------------------------------------
    Returns:
    transpose(f): ndarray
        A transposed feature-sample matrix.
    c: ndarray
        A NumPy array containing classification labels.
    ---------------------------------------------------------
    """
    # Check if the data files exist at the given paths
    if not os.path.exists(data_path):
        raise FileNotFoundError(
            f"The data file was not found at '{data_path}'. Please ensure it's in the correct location.")
    if not os.path.exists(label_path):
        raise FileNotFoundError(
            f"The label file was not found at '{label_path}'.
Please ensure it's in the correct location.") # Continue with the rest of your function data = filter_samples(threshold, data_path) # data = read_csv(data_path, header=0, index_col=0) data = data.transpose() lable = read_csv(label_path, header=0, index_col=0) lable = lable[lable_name] data = merge(data, lable, left_index=True, right_index=True) values = unique(data.values, axis=0) f = auto_norm(values[:, :-1]) # data normalization for constant value c = values[:, -1] return transpose(f), c ================================================ FILE: TransProPy/UtilsFunction1/NewFeatureRanking.py ================================================ from numpy import * from TransProPy.UtilsFunction1.Auc import auc def new_feature_ranking(f, c, AUC_threshold, max_rank, pos, neg, n0, n1): f_auc = [] f_no = [str(i) for i in range(shape(f)[0])] f_mtf = full((shape(f)[0], shape(f)[1]), False) f_ne = [] fl = shape(f_no)[0] # New addition: To store features with AUC greater than AUC_threshold and their AUC values high_auc_features = [] # Calculate the AUC for each feature for j in range(fl): argfv = argsort(f[j]) slofe = c[argfv] ne = slofe[0] a = auc(slofe, ne, n0, n1) if a < 0.5: if slofe[0] == slofe[-1]: a = 1 - a if ne == pos: ne = neg else: ne = pos f_auc.append(a) f_ne.append(ne) # New addition: Check and record features with AUC greater than AUC_threshold. if a > AUC_threshold: high_auc_features.append((f_no[j], a)) # Sort high_auc_features by AUC value high_auc_features = sorted(high_auc_features, key=lambda x: x[1], reverse=True) ml = 1 mr = 1 for i in range(1, size(slofe)): if slofe[i] == slofe[0]: ml += 1 else: break for i in range(-2, -size(slofe), -1): if slofe[i] == slofe[-1]: mr += 1 else: break mr = size(slofe) - mr if slofe[0] == slofe[-1]: if not slofe[0] == ne: ml = 0 else: mr = size(slofe) f_mtf[j][argfv[ml:mr]] = True # New addition: Exclude features with AUC greater than AUC_threshold from the original set. remaining_indices = [i for i, a in enumerate(f_auc) if a <= AUC_threshold] remaining_f_no = [f_no[i] for i in remaining_indices] remaining_f_auc = [f_auc[i] for i in remaining_indices] remaining_f_mtf = [f_mtf[i] for i in remaining_indices] remaining_f_ne = [f_ne[i] for i in remaining_indices] # Update 'fl' to the number of remaining features. fl = len(remaining_f_no) # Sort and process the remaining features. 
    # Index the original feature matrix by the remaining features first, so the rows
    # of Fvalue stay aligned with FName after the AUC-based reordering.
    arg_auc = argsort(-array(remaining_f_auc))
    FName = array(remaining_f_no)[arg_auc]
    Fvalue = array(f)[remaining_indices][arg_auc]
    Fauc = array(remaining_f_auc)[arg_auc]
    Fne = array(remaining_f_ne)[arg_auc]
    FmTF = array(remaining_f_mtf)[arg_auc]
    kk = 0
    slen = 0
    Fmcount = ones((len(FmTF[0])))
    Fmcount = Fmcount.astype(bool)
    for i in range(fl):
        if Fauc[i] < 0.5:
            kk += 1
            Fmcount &= FmTF[i]
            if True in Fmcount:
                slen += 1
    # print('Totally ', kk, ' features with auc under 0.5')
    for i in range(fl):
        if Fauc[i] < 0.5:
            continue
        for j in range(i + 1, fl):
            if Fauc[j] < 0.5:
                continue
            nflg = 0
            if not ((FmTF[i] & FmTF[j]) == FmTF[i]).all():
                if not ((FmTF[i] & FmTF[j]) == FmTF[j]).all():
                    nflg = 1
            if nflg == 0:
                if FmTF[i].sum() <= FmTF[j].sum():
                    Fauc[j] = -2
                else:
                    Fauc[i] = -2
                    break
    ii = []
    gg = []
    for i in range(fl):
        if Fauc[i] > 0.5:
            ii.append(i)
        else:
            gg.append(i)
    arg_auc_gg = argsort(-array(f_auc)[gg])
    gg = [str(FName[i]) for i in array(gg)[arg_auc_gg]]
    # print('Totally ' + str(fl - len(ii)) + ' features are covered and removed.')
    # print(gg)
    FName = FName[ii]
    Fvalue = Fvalue[ii]
    Fauc = Fauc[ii]
    Fne = Fne[ii]
    FmTF = FmTF[ii]
    over = 0
    if max_rank > len(ii):
        over = max_rank - len(ii)
        max_rank = len(ii)
    # start ranking
    rankset = []  # store unique features
    ranklist = []  # with overlap
    order = 0
    while len(rankset) < max_rank:
        ## start selection
        rnk = 2
        mv_auc = Fauc[order]
        fs = [FName[order]]
        cpms = FmTF[order]
        fl = shape(FName)[0]
        while mv_auc != 1:
            ft = 0
            temp = 0
            for j in range(fl):
                if FName[j] not in fs:
                    tmpFmTF = cpms & FmTF[j]
                    if not ((FmTF[j] & cpms) == cpms).all():
                        mauc = 0
                        for g in fs + [FName[j]]:
                            fval = Fvalue[argwhere(FName == g)[0][0]][tmpFmTF]
                            stwlofe = array(c)[tmpFmTF]
                            argfv = argsort(fval)
                            slofe = stwlofe[argfv]
                            tauc = auc(slofe, Fne[argwhere(FName == g)[0][0]], n0, n1)
                            mauc += tauc
                        tmpauc = mauc / rnk
                        if tmpauc > mv_auc:
                            mv_auc = tmpauc
                            ft = j
                            temp = Fauc[j]
                        elif tmpauc == mv_auc and Fauc[j] > temp:
                            ft = j
                            temp = Fauc[j]
            if mv_auc == -2 or ft == 0:
                break
            fs.append(FName[ft])
            cpms = cpms & FmTF[ft]
            rnk += 1
        # print('\nRank-' + str(rnk - 1) + ' mvAUC: ' + str(mv_auc) + ' Feature set:', fs)
        for i in fs:
            ranklist.append(i)
            if i not in rankset:
                rankset.append(i)
        order += 1
    if over != 0:
        ranklist = ranklist + list(gg)[:over]
        rankset = rankset + list(gg)[:over]
    # Return the features with an AUC greater than AUC_threshold, along with the other ranked and filtered feature information
    return high_auc_features, FName, Fauc, rankset, ranklist


================================================
FILE: TransProPy/UtilsFunction1/PrintResults.py
================================================
def print_results(high_auc_features, fr, fre1, frequency, len_FName, FName, Fauc):
    print('Features with AUC above the threshold: ', high_auc_features)
    print('Ranked features (start from higher rank): ', fr)
    print('Features and their frequencies: ', fre1)
    print('Sorted features with frequency higher than 1: ', frequency)
    print('The count of AUC values greater than 0.5: ', len_FName)
    print('The list of feature names after ranking (AUC > 0.5): ', FName)
    print('The list of AUC values corresponding to the ranked feature names: ', Fauc)


================================================
FILE: TransProPy/UtilsFunction1/__init__.py
================================================


================================================
FILE: TransProPy/UtilsFunction2/LogTransform.py
================================================
import numpy as np


def log_transform(data):
    """
    Evaluate and potentially apply log2 transformation to data.
    This function checks the data against a set of criteria to determine whether a log2 transformation is needed, applying the transformation if necessary.
    -----------------------------------------------------------------------------------------------------------------------------------------------
    Parameters:
    - data (np.ndarray): A numerical numpy array.
    ------------------------------------------
    Returns:
    - result (np.ndarray): The original data, or the data transformed with log2.
    -----------------------------------------------------------------
    """
    # Calculate quantiles
    qx = np.quantile(data, [0., 0.25, 0.5, 0.75, 0.99, 1.0])

    # Define conditions for log transformation
    LogC = (qx[4] > 100) or \
           (qx[5] - qx[0] > 50 and qx[1] > 0) or \
           (qx[1] > 0 and qx[1] < 1 and qx[3] > 1 and qx[3] < 2)

    # Apply log transformation based on conditions
    if LogC:
        data[data <= 0] = np.nan  # Use NaN for non-applicable data
        result = np.log2(data)
        print("log2 transform finished")
    else:
        result = data
        print("log2 transform not needed")

    return result


================================================
FILE: TransProPy/UtilsFunction2/__init__.py
================================================


================================================
FILE: TransProPy/UtilsFunction2/splitdata.py
================================================
import pandas as pd
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split


def split_data(gene_data_path, class_data_path, class_name, test_size=0.2,
               random_state=42, threshold=0.9, random_feature=None):
    """
    Reads the gene expression and class data, processes it, and splits it into training and testing sets.
    -----------------------------------------------------------------------------------------------------
    Parameters:
    - gene_data_path (str): Path to the CSV file containing the gene expression data. For example: '../data/gene_tpm.csv'
    - class_data_path (str): Path to the CSV file containing the class data. For example: '../data/tumor_class.csv'
    - class_name (str): The name of the class column in the class data.
    - test_size (float, optional): The proportion of the data to be used as the testing set. Default is 0.2.
    - random_state (int, optional): The seed used by the random number generator. Default is 42.
    - threshold (float, optional): The threshold used to filter out rows based on the proportion of non-zero values. Default is 0.9.
    - random_feature (int, optional): The number of random features to select. If None, no random feature selection is performed. Default is None.
    ---------------------------------------------------------------------------------------------------------------------------------------------
    Returns:
    - train_data (pd.DataFrame): The training data.
    - test_data (pd.DataFrame): The testing data.
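
    Example (illustrative; 'class' is a hypothetical column in tumor_class.csv):
    >>> train_df, test_df = split_data('../data/gene_tpm.csv', '../data/tumor_class.csv',
    ...                                class_name='class', test_size=0.2, threshold=0.9)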
    ---------------------------------------------
    """
    # Reading the data
    X = pd.read_csv(gene_data_path, index_col=0, header=0)
    y = pd.read_csv(class_data_path, index_col=0, header=0)

    # Finding common sample names between X (column names) and y (row names)
    common = X.columns.intersection(y.index)

    # Filtering out low-quality data
    non_zero_counts = X.astype(bool).sum(axis=1)
    X = X[non_zero_counts / X.shape[1] > threshold]

    # If random_feature is specified, randomly sample that many feature rows from X
    if random_feature is not None:
        X = X.sample(n=random_feature, random_state=random_state)

    # Keeping only the common samples in X and y
    X = X.loc[:, common]
    y = y.loc[common]

    # Transposing X and merging it with the specified column from y
    X = X.transpose()
    Y = y[class_name]
    data = pd.merge(X, Y, left_index=True, right_index=True)  # data is a DataFrame containing features and labels

    # First, randomize the data with the caller-supplied seed
    data = shuffle(data, random_state=random_state)

    # Splitting the data into training and validation sets
    # Then perform stratified sampling
    train_data, test_data = train_test_split(data, test_size=test_size,
                                             random_state=random_state, stratify=data[class_name])

    return train_data, test_data


================================================
FILE: TransProPy/UtilsFunction3/EnsembleForRFE.py
================================================
from sklearn.base import BaseEstimator
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
import numpy as np


class EnsembleForRFE(BaseEstimator):
    """
    Ensemble estimator for recursive feature elimination.

    Parameters:
    - svm_C: float, regularization parameter for SVM.
    - tree_max_depth: int, maximum depth of the decision tree.
    - tree_min_samples_split: int, minimum number of samples required to split an internal node.
    - gbm_learning_rate: float, learning rate for gradient boosting.
    - gbm_n_estimators: int, number of boosting stages to be run for gradient boosting.
    """
    def __init__(self, svm_C=1.0, tree_max_depth=None, tree_min_samples_split=2,
                 gbm_learning_rate=0.1, gbm_n_estimators=100):
        # Save passed parameters as class attributes
        self.svm_C = svm_C
        self.tree_max_depth = tree_max_depth
        self.tree_min_samples_split = tree_min_samples_split
        self.gbm_learning_rate = gbm_learning_rate
        self.gbm_n_estimators = gbm_n_estimators

        # Initialize individual models with the specified parameters
        self.svm = SVC(kernel="linear", probability=True, C=self.svm_C)
        self.tree = DecisionTreeClassifier(max_depth=self.tree_max_depth,
                                           min_samples_split=self.tree_min_samples_split)
        self.gbm = GradientBoostingClassifier(learning_rate=self.gbm_learning_rate,
                                              n_estimators=self.gbm_n_estimators)

        self.feature_importances_ = None  # Initialize feature importances attribute

    def fit(self, X, y):
        """
        Fit the individual models and compute aggregated feature importances.

        Parameters:
        - X: DataFrame, Feature dataset with shape (n_samples, n_features).
        - y: ndarray, 1-D array of target values with shape (n_samples,).

        Returns:
        - self: object, Instance of the model.
        """
        # Fit individual models
        self.svm.fit(X, y)
        self.tree.fit(X, y)
        self.gbm.fit(X, y)

        # Calculate feature importances and store as attributes
        svm_importances = np.abs(self.svm.coef_[0])
        tree_importances = self.tree.feature_importances_
        gbm_importances = self.gbm.feature_importances_

        # Average feature importances
        self.feature_importances_ = (svm_importances + tree_importances + gbm_importances) / 3

        return self

    def predict(self, X):
        """
        Predict class labels for samples in X using a soft voting mechanism.
Parameters: - X: DataFrame, Input features. Returns: - Predicted class labels. """ # Get the probability predictions from individual models probabilities = np.array([self.svm.predict_proba(X), self.tree.predict_proba(X), self.gbm.predict_proba(X)]) # Average probabilities for soft voting avg_prob = np.mean(probabilities, axis=0) # Predict class labels based on the highest probability return np.argmax(avg_prob, axis=1) def set_params(self, **params): """ Set parameters for the ensemble estimator. This will be used by hyperparameter optimization methods like RandomizedSearchCV to update the parameters of the individual models. Parameters: - **params: Keyword arguments for parameter names and values. """ # Update the parameter values based on provided keyword arguments for key, value in params.items(): if key in ['svm_C', 'tree_max_depth', 'tree_min_samples_split', 'gbm_learning_rate', 'gbm_n_estimators']: setattr(self, key, value) # Re-initialize the models with the updated parameters self.svm = SVC(kernel="linear", probability=True, C=self.svm_C) self.tree = DecisionTreeClassifier(max_depth=self.tree_max_depth, min_samples_split=self.tree_min_samples_split) self.gbm = GradientBoostingClassifier(learning_rate=self.gbm_learning_rate, n_estimators=self.gbm_n_estimators) return self ================================================ FILE: TransProPy/UtilsFunction3/ExtractAndSaveResults.py ================================================ # TransProPy.UtilsFunction3.extract_and_save_results.py import numpy as np import pandas as pd import matplotlib.pyplot as plt from TransProPy.UtilsFunction3.PrintBoxedText import print_boxed_text from sklearn.metrics import roc_curve, roc_auc_score from sklearn.model_selection import cross_val_predict from sklearn.model_selection import StratifiedKFold def extract_and_save_results( clf, X, Y, save_path, n_cv, show_plot=False, use_tkagg=False): """ Extract and save various results from the trained model. Parameters: - clf: trained model (RandomizedSearchCV object). - X: DataFrame, feature data used for training. - save_path: str, base path for saving results. - show_plot: bool, whether to display the plot. - use_tkagg: bool, whether to use 'TkAgg' backend for matplotlib. Generally, choose True when using in PyCharm IDE, and choose False when rendering file.qmd to an HTML file. 
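    - Y: ndarray, label data aligned with the rows of X (used for the cross-validated ROC computation).
    - n_cv: int, number of folds for the StratifiedKFold cross-validation used when predicting probabilities.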
""" # Setting the matplotlib backend to 'TkAgg' if specified if use_tkagg: import matplotlib matplotlib.use('TkAgg') # Extracting cross-validation results cv_results = clf.cv_results_ mean_test_scores = cv_results['mean_test_score'] # Calculate the average test score for each iteration n_iterations = len(mean_test_scores) # Plotting and saving the accuracy per iteration figure plt.figure(figsize=(6, 4), facecolor='#f0f8fe') plt.plot(range(1, n_iterations + 1), mean_test_scores, marker='o') plt.title('Model Accuracy per Iteration') plt.xlabel('Iteration') plt.ylabel('Mean Test Accuracy') plt.grid(True, color='#11479c', alpha=0.2) # Get the current axes (ax), and set the background color of the plot area to white ax = plt.gca() ax.set_facecolor('#e1f0fb') # Call tight_layout to automatically adjust the layout plt.tight_layout() plt.savefig(save_path + "Model_Accuracy_per_Iteration_figure.pdf", format='pdf') # Optionally display the plot if show_plot: plt.show() # plotting the ROC curve # Predict probabilities y_probas = cross_val_predict(clf.best_estimator_, X, Y, cv=StratifiedKFold(n_splits=n_cv), method='predict_proba') # Take the probability of the positive class y_scores = y_probas[:, 1] # Calculate values for the ROC curve fpr, tpr, thresholds = roc_curve(Y, y_scores) # Calculate AUC value roc_auc = roc_auc_score(Y, y_scores) # Plot and save the ROC curve plt.figure(figsize=(6, 4), facecolor='#f0f8fe') plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc) plt.plot([0, 1], [0, 1], 'k--') # Diagonal line for a random classifier plt.xlabel('False Positive Rate') plt.ylabel('True Positive Rate') plt.title('ROC Curve') plt.legend(loc='lower right') # Get the current axes (ax), and set the background color of the plot area to white ax = plt.gca() ax.set_facecolor('#e1f0fb') # Call tight_layout to automatically adjust the layout plt.tight_layout() plt.savefig(save_path + "ROC_Curve_figure.pdf", format='pdf') # Optionally display the plot if show_plot: plt.show() # Extracting feature selection results feature_union = clf.best_estimator_.named_steps['feature_selection'] rfecv = feature_union.transformer_list[0][1] selectkbest = feature_union.transformer_list[1][1] selected_features_rfecv = rfecv.support_ selected_features_selectkbest = selectkbest.get_support() # Print selected features print_boxed_text("Features selected by RFECV:") print(X.columns[selected_features_rfecv]) print_boxed_text("Features selected by SelectKBest:") print(X.columns[selected_features_selectkbest]) # Combining and saving selected features combined_selected_features = np.logical_or(selected_features_rfecv, selected_features_selectkbest) combined_features_df = pd.DataFrame({'Feature': X.columns[combined_selected_features]}) combined_features_df.to_csv(save_path + 'combined_features.csv', index=False) print_boxed_text(f"Total number of selected features: {combined_features_df.shape[0]}") # Extracting and saving EnsembleForRFE feature importances ensemble_for_rfe = feature_union.transformer_list[0][1].estimator_ feature_importances_ensemble = ensemble_for_rfe.feature_importances_ importances_ensemble = zip(X.columns[selected_features_rfecv], feature_importances_ensemble) sorted_importances_ensemble = sorted(importances_ensemble, key=lambda x: x[1], reverse=True) df_importances_ensemble = pd.DataFrame(sorted_importances_ensemble, columns=['Feature', 'Importance']) df_importances_ensemble.to_csv(save_path + 'ensemble_importances.csv', index=False) print_boxed_text("Feature Importances from EnsembleForRFE:") 


================================================
FILE: TransProPy/UtilsFunction3/ExtractCommonSamples.py
================================================
def extract_common_samples(X, Y):
    """
    Extracts common samples (rows) from two DataFrames based on their indices.

    Parameters:
    X (pd.DataFrame): First DataFrame.
    Y (pd.DataFrame): Second DataFrame.

    Returns:
    pd.DataFrame, pd.DataFrame: Two DataFrames containing only the rows
    that are present in both.
    """
    # Find the indices common to both DataFrames
    common_indices = X.index.intersection(Y.index)

    # Filter both DataFrames to keep only the common rows
    X_common = X.loc[common_indices]
    Y_common = Y.loc[common_indices]

    return X_common, Y_common


================================================
FILE: TransProPy/UtilsFunction3/LoadAndPreprocessData.py
================================================
# TransProPy.UtilsFunction3.load_and_preprocess_data.py
from TransProPy.UtilsFunction3.LoadFilterTranspose import load_filter_transpose
from TransProPy.UtilsFunction3.LoadEncodeLabels import load_encode_labels
from TransProPy.UtilsFunction3.ExtractCommonSamples import extract_common_samples


def load_and_preprocess_data(feature_file, label_file, label_column, threshold):
    """
    Load and preprocess the data.

    Parameters:
    - feature_file: str, path to the feature data file.
    - label_file: str, path to the label data file.
    - label_column: str, column name of the labels in the label file.
    - threshold: float, threshold for filtering in the load_filter_transpose function.

    Returns:
    - X: DataFrame, preprocessed feature data.
    - Y: ndarray, preprocessed label data.
    """
    X = load_filter_transpose(threshold, feature_file)  # Load and filter features
    Y = load_encode_labels(label_file, label_column)  # Load and encode labels
    X, Y = extract_common_samples(X, Y)  # Keep only samples present in both
    Y = Y.values.ravel()  # Flatten Y to a 1D array
    return X, Y


================================================
FILE: TransProPy/UtilsFunction3/LoadEncodeLabels.py
================================================
from sklearn.preprocessing import LabelEncoder
import pandas as pd


def load_encode_labels(file_path, column_name):
    """
    Reads a CSV file containing labels and encodes the categorical labels
    in the specified column as numeric labels.

    Parameters:
    file_path (str): Path to the CSV file containing labels.
    column_name (str): Name of the column to be encoded.

    Returns:
    Y (pd.DataFrame): A DataFrame containing the encoded numeric labels.
    """
    # Load the data
    y = pd.read_csv(file_path, index_col=0, header=0)

    # Check that the specified column exists in the DataFrame
    if column_name not in y.columns:
        raise ValueError(f"Column '{column_name}' not found in the DataFrame")

    # Create an instance of LabelEncoder
    le = LabelEncoder()

    # Apply LabelEncoder to the specified column.
    # Many scikit-learn models require Y to be numerical; if Y is
    # categorical, fit_transform converts the character labels to integers.
    y_encoded = le.fit_transform(y[column_name])

    # Convert the encoded labels back to a DataFrame
    Y = pd.DataFrame(y_encoded, index=y.index, columns=[column_name])

    return Y
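
# -----------------------------------------------------------------
# Hypothetical usage sketch (added for illustration). The file name
# demo_labels.csv and the column name "class" are made up for the demo.
if __name__ == "__main__":
    demo = pd.DataFrame({"class": ["tumor", "normal", "tumor"]},
                        index=["s1", "s2", "s3"])
    demo.to_csv("demo_labels.csv")
    # "normal" is encoded as 0 and "tumor" as 1 (alphabetical order)
    print(load_encode_labels("demo_labels.csv", "class"))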


================================================
FILE: TransProPy/UtilsFunction3/LoadFilterTranspose.py
================================================
import pandas as pd


def load_filter_transpose(threshold, data_path='../data/gene_tpm.csv'):
    """
    Remove features (genes) with a high proportion of zero expression,
    then transpose the matrix so that rows are samples.
    -----------------------------------------------------------------
    Parameters
        data_path: string
            For example: '../data/gene_tpm.csv'
            Please note: the input data matrix should have genes as rows
            and samples as columns.
        threshold: float
            For example: 0.9
            The minimum proportion of samples with a non-zero value that
            a feature must have in order to be kept.
    -----------------------------------------------------------------
    Return
        X: pandas.core.frame.DataFrame
    -----------------------------------------------------------------
    """
    data = pd.read_csv(data_path, index_col=0, header=0)

    # Count the non-zero values in each row (gene).
    non_zero_counts = data.astype(bool).sum(axis=1)

    # Keep only the genes whose proportion of non-zero samples exceeds
    # the threshold (e.g. threshold = 0.9).
    X = data[non_zero_counts / data.shape[1] > threshold]

    # Transpose so that rows are samples and columns are genes.
    X = X.transpose()

    # Return the result.
    return X


================================================
FILE: TransProPy/UtilsFunction3/LoggingCustomScorer.py
================================================
from sklearn.metrics import accuracy_score
import logging
import time


def logging_custom_scorer(n_iter=10, n_cv=5):
    """
    Creates a custom scorer function for use in model evaluation processes.
    This scorer logs both the accuracy score and the time taken for each call.

    Parameters:
    n_iter (int): Number of iterations for the search process. Default is 10.
    n_cv (int): Number of cross-validation splits. Default is 5.
        (Note: these two parameters are currently not used inside the scorer.)

    Returns:
    function: A custom scorer function that logs the accuracy score and
    the time taken for each call.
    """
    # Initialize the time for the first call
    last_time = time.time()

    def custom_scorer(y_true, y_pred):
        """
        Inner function that calculates the accuracy score, logs it, and
        measures the time taken.

        Parameters:
        y_true (array-like): True labels.
        y_pred (array-like): Predicted labels from the model.

        Returns:
        float: The accuracy score.
        """
        nonlocal last_time  # Reference last_time from the enclosing scope

        # Record the current time and compute the elapsed time since the last call
        current_time = time.time()
        elapsed = current_time - last_time
        last_time = current_time  # Update last_time for the next call

        # Calculate the accuracy score
        score = accuracy_score(y_true, y_pred)

        # Log the accuracy and the time taken for this scoring iteration
        logging.info(f"One scoring iteration completed, accuracy: {score}, "
                     f"time taken: {elapsed:.2f} seconds")

        return score

    return custom_scorer
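
# -----------------------------------------------------------------
# Hypothetical usage sketch (added for illustration): call the returned
# scorer directly on toy label arrays.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    scorer = logging_custom_scorer()
    print(scorer([0, 1, 1, 0], [0, 1, 0, 0]))  # logs and prints 0.75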
""" # Create the top and bottom border line of the box. # The border line consists of a hash symbol, followed by equals symbols # the length of the title plus two (for padding), and then another hash symbol. border_line = "#" + "=" * (len(title) + 2) + "#" # Print the top border line. print("\n" + border_line) # Print the title, surrounded by hash symbols and padded with one space on each side. print(f"# {title} #") # Print the bottom border line. print(border_line) ================================================ FILE: TransProPy/UtilsFunction3/SetupFeatureSelection.py ================================================ # TransProPy.UtilsFunction3.setup_feature_selection.py from sklearn.feature_selection import RFECV, SelectKBest, mutual_info_classif from sklearn.model_selection import StratifiedKFold from sklearn.pipeline import FeatureUnion from TransProPy.UtilsFunction3.EnsembleForRFE import EnsembleForRFE def setup_feature_selection(): """ Set up the feature selection process. Returns: - feature_selection: FeatureUnion, combined feature selection process. """ ensemble_estimator = EnsembleForRFE() rfecv = RFECV(estimator=ensemble_estimator, cv=StratifiedKFold(5), scoring='accuracy') selectkbest = SelectKBest(score_func=mutual_info_classif) return FeatureUnion([("rfecv", rfecv), ("selectkbest", selectkbest)]) ================================================ FILE: TransProPy/UtilsFunction3/SetupLoggingAndProgressBar.py ================================================ import logging from tqdm import tqdm def setup_logging_and_progress_bar(n_iter, n_cv): """ Set up logging and initialize a tqdm progress bar. Parameters: n_iter (int): Number of iterations for RandomizedSearchCV. n_cv (int): Number of cross-validation folds. Returns: tqdm object: An initialized tqdm progress bar. """ # Configure basic logging - this time, without filename and filemode logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s: %(message)s') # Create a file handler for logging to a file file_handler = logging.FileHandler('progress.log', mode='w') file_handler.setLevel(logging.INFO) file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s: %(message)s')) # Create a stream handler for logging to the console # stream_handler = logging.StreamHandler() # stream_handler.setLevel(logging.INFO) # stream_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s: %(message)s')) # Get the default logger and add the two handlers to it logger = logging.getLogger() logger.addHandler(file_handler) # logger.addHandler(stream_handler) # Calculate total iterations total_iterations = n_iter * n_cv # Initialize and return tqdm progress bar pbar = tqdm(total=total_iterations, desc='RandomizedSearchCV Progress') return pbar ================================================ FILE: TransProPy/UtilsFunction3/TqdmCustomScorer.py ================================================ from sklearn.metrics import make_scorer, accuracy_score from tqdm import tqdm def tqdm_custom_scorer(n_iter=10, n_cv=5): """ This function creates a custom scorer for use in model evaluation processes like RandomizedSearchCV. It integrates a progress bar to track the evaluation process. Parameters: n_iter (int): Number of iterations for the search process. Default is 10. n_cv (int): Number of cross-validation splits. Default is 5. Returns: function: A custom scorer function that can be used with model evaluation methods. 
""" # Initialize a tqdm progress bar with a total count based on the number of iterations and CV splits pbar = tqdm(total=n_iter * n_cv, desc='RandomizedSearchCV progress') # Define an inner function that will be used as the scorer def custom_scorer(y_true, y_pred): """ Inner function to calculate the accuracy score and update the progress bar. Parameters: y_true (array-like): True labels. y_pred (array-like): Predicted labels by the model. Returns: float: The accuracy score. """ # Calculate the accuracy score score = accuracy_score(y_true, y_pred) # Update the progress bar pbar.update() return score # Return the custom scorer function return custom_scorer ================================================ FILE: TransProPy/UtilsFunction3/TrainModel.py ================================================ # TransProPy.UtilsFunction3.train_model.py from sklearn.pipeline import Pipeline from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV from sklearn.metrics import make_scorer from TransProPy.UtilsFunction3.LoggingCustomScorer import logging_custom_scorer from sklearn.preprocessing import StandardScaler from sklearn.ensemble import StackingClassifier from sklearn.svm import SVC from sklearn.tree import DecisionTreeClassifier from sklearn.ensemble import GradientBoostingClassifier from sklearn.linear_model import LogisticRegression def train_model(X, Y, feature_selection, parameters, n_iter, n_cv, n_jobs=9): """ Set up and run the model training process. Parameters: - X: DataFrame, feature data. - Y: ndarray, label data. - feature_selection: FeatureUnion, the feature selection process. - parameters: dict, parameters for RandomizedSearchCV. - n_iter: int, number of iterations for RandomizedSearchCV. - n_cv: int, number of cross-validation folds. - n_jobs: int, number of jobs to run in parallel (default is 9). Returns: - clf: RandomizedSearchCV object after fitting. """ feature_selection_pipeline = Pipeline([ ('scale', StandardScaler()), ('feature_selection', feature_selection), ('stacking', StackingClassifier( estimators=[ ('svm', SVC(probability=True)), ('dt', DecisionTreeClassifier()), ('gbm', GradientBoostingClassifier()) ], final_estimator=LogisticRegression())) ]) clf = RandomizedSearchCV( feature_selection_pipeline, parameters, cv=StratifiedKFold(n_splits=n_cv), scoring=make_scorer(logging_custom_scorer(n_iter=n_iter, n_cv=n_cv)), n_iter=n_iter, random_state=0, error_score='raise', n_jobs=n_jobs # Use the customizable n_jobs parameter ) return clf ================================================ FILE: TransProPy/UtilsFunction3/UpdateProgressBar.py ================================================ def update_progress_bar(pbar, log_file='progress.log'): """ Read the number of log entries in the log file and update the tqdm progress bar. Parameters: pbar (tqdm): The tqdm progress bar object. log_file (str): Path to the log file, default is 'progress.log'. 
""" def count_logged_iterations(): """Read and return the number of log entries in the log file.""" with open(log_file, 'r') as file: return sum(1 for _ in file) # Read the log file and update the progress bar logged_iterations = count_logged_iterations() pbar.update(logged_iterations - pbar.n) # Only increase by the number of new iterations logged ================================================ FILE: TransProPy/UtilsFunction3/__init__.py ================================================ ================================================ FILE: TransProPy/__init__.py ================================================ ================================================ FILE: setup.py ================================================ from setuptools import setup, find_packages setup( name='transpropy', version='1.0.0', packages=find_packages(), install_requires=[ "numpy", "pandas", "setuptools", "scikit-learn", "tqdm" ], url='https://github.com/SSSYDYSSS/TransProPy', author='Yu Dongyue', author_email='yudongyue@mail.nankai.edu.cn', description='A collection of deep learning models that integrate algorithms and various machine learning approaches to extract features (genes) effective for classification and attribute them accordingly.' )