[
  {
    "path": ".github/workflows/python-package.yml",
    "content": "name: Python package\n\non:\n  push:\n    branches: [ \"main\" ]\n  pull_request:\n    branches: [ \"main\" ]\n\njobs:\n  build:\n\n    runs-on: ubuntu-latest\n    strategy:\n      fail-fast: false\n      matrix:\n        python-version: [\"3.9\", \"3.10\", \"3.11\"]\n\n    steps:\n    - uses: actions/checkout@v4\n    - name: Set up Python ${{ matrix.python-version }}\n      uses: actions/setup-python@v3\n      with:\n        python-version: ${{ matrix.python-version }}\n    - name: Install dependencies\n      run: |\n        python -m pip install --upgrade pip\n        python -m pip install flake8 pytest\n        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi\n    - name: Lint with flake8\n      run: |\n        # Lint the code but don't fail the build on errors\n        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics || true\n        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics\n    - name: Test with pytest\n      run: |\n        # Run pytest but don't fail the build if no tests are defined or if tests fail\n        pytest || true\n"
  },
  {
    "path": ".gitignore",
    "content": "# Created by https://www.toptal.com/developers/gitignore/api/python\n# Edit at https://www.toptal.com/developers/gitignore?templates=python\n\n### Python ###\n# Byte-compiled / optimized / DLL files\n__pycache__/\n*.py[cod]\n*$py.class\n\n# C extensions\n*.so\n\n# Distribution / packaging\n.Python\nbuild/\ndevelop-eggs/\ndist/\ndownloads/\neggs/\n.eggs/\nlib/\nlib64/\nparts/\nsdist/\nvar/\nwheels/\nshare/python-wheels/\n*.egg-info/\n.installed.cfg\n*.egg\nMANIFEST\n\n# PyInstaller\n#  Usually these files are written by a python script from a template\n#  before PyInstaller builds the exe, so as to inject date/other infos into it.\n*.manifest\n*.spec\n\n# Installer logs\npip-log.txt\npip-delete-this-directory.txt\n\n# Unit test / coverage reports\nhtmlcov/\n.tox/\n.nox/\n.coverage\n.coverage.*\n.cache\nnosetests.xml\ncoverage.xml\n*.cover\n*.py,cover\n.hypothesis/\n.pytest_cache/\ncover/\n\n# Translations\n*.mo\n*.pot\n\n# Django stuff:\n*.log\nlocal_settings.py\ndb.sqlite3\ndb.sqlite3-journal\n\n# Flask stuff:\ninstance/\n.webassets-cache\n\n# Scrapy stuff:\n.scrapy\n\n# Sphinx documentation\ndocs/_build/\n\n# PyBuilder\n.pybuilder/\ntarget/\n\n# Jupyter Notebook\n.ipynb_checkpoints\n\n# IPython\nprofile_default/\nipython_config.py\n\n# pyenv\n#   For a library or package, you might want to ignore these files since the code is\n#   intended to run in multiple environments; otherwise, check them in:\n# .python-version\n\n# pipenv\n#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.\n#   However, in case of collaboration, if having platform-specific dependencies or dependencies\n#   having no cross-platform support, pipenv may install dependencies that don't work, or not\n#   install all needed dependencies.\n#Pipfile.lock\n\n# poetry\n#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.\n#   This is especially recommended for binary packages to ensure reproducibility, and is more\n#   commonly ignored for libraries.\n#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control\n#poetry.lock\n\n# pdm\n#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.\n#pdm.lock\n#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it\n#   in version control.\n#   https://pdm.fming.dev/#use-with-ide\n.pdm.toml\n\n# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm\n__pypackages__/\n\n# Celery stuff\ncelerybeat-schedule\ncelerybeat.pid\n\n# SageMath parsed files\n*.sage.py\n\n# Environments\n.env\n.venv\nenv/\nvenv/\nENV/\nenv.bak/\nvenv.bak/\n\n# Spyder project settings\n.spyderproject\n.spyproject\n\n# Rope project settings\n.ropeproject\n\n# mkdocs documentation\n/site\n\n# mypy\n.mypy_cache/\n.dmypy.json\ndmypy.json\n\n# Pyre type checker\n.pyre/\n\n# pytype static type analyzer\n.pytype/\n\n# Cython debug symbols\ncython_debug/\n\n# PyCharm\n#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can\n#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore\n#  and can be added to the global gitignore or merged into this file.  
For a more nuclear\n#  option (not recommended) you can uncomment the following to ignore the entire idea folder.\n#.idea/\n\n### Python Patch ###\n# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration\npoetry.toml\n\n# ruff\n.ruff_cache/\n\n# LSP config files\npyrightconfig.json\n\n# End of https://www.toptal.com/developers/gitignore/api/python"
  },
  {
    "path": ".idea/.gitignore",
    "content": "# Default ignored files\n/shelf/\n/workspace.xml\n# Editor-based HTTP Client requests\n/httpRequests/\n# Datasource local storage ignored files\n/dataSources/\n/dataSources.local.xml\n"
  },
  {
    "path": ".idea/TransProPy.iml",
    "content": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<module type=\"PYTHON_MODULE\" version=\"4\">\n  <component name=\"NewModuleRootManager\">\n    <content url=\"file://$MODULE_DIR$\" />\n    <orderEntry type=\"inheritedJdk\" />\n    <orderEntry type=\"sourceFolder\" forTests=\"false\" />\n  </component>\n  <component name=\"PyDocumentationSettings\">\n    <option name=\"format\" value=\"GOOGLE\" />\n    <option name=\"myDocStringFormat\" value=\"Google\" />\n  </component>\n</module>"
  },
  {
    "path": ".idea/inspectionProfiles/profiles_settings.xml",
    "content": "<component name=\"InspectionProjectProfileManager\">\n  <settings>\n    <option name=\"USE_PROJECT_PROFILE\" value=\"false\" />\n    <version value=\"1.0\" />\n  </settings>\n</component>"
  },
  {
    "path": ".idea/modules.xml",
    "content": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<project version=\"4\">\n  <component name=\"ProjectModuleManager\">\n    <modules>\n      <module fileurl=\"file://$PROJECT_DIR$/.idea/TransProPy.iml\" filepath=\"$PROJECT_DIR$/.idea/TransProPy.iml\" />\n    </modules>\n  </component>\n</project>"
  },
  {
    "path": ".idea/vcs.xml",
    "content": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<project version=\"4\">\n  <component name=\"VcsDirectoryMappings\">\n    <mapping directory=\"$PROJECT_DIR$\" vcs=\"Git\" />\n  </component>\n</project>"
  },
  {
    "path": "LICENSE",
    "content": "BSD 3-Clause License\n\nCopyright (c) 2023- Yu Dongyue, SuperOmics All rights reserved.\n\nRedistribution and use in source and binary forms, with or without\nmodification, are permitted provided that the following conditions are met:\n\n* Redistributions of source code must retain the above copyright notice, this\n  list of conditions and the following disclaimer.\n\n* Redistributions in binary form must reproduce the above copyright notice,\n  this list of conditions and the following disclaimer in the documentation\n  and/or other materials provided with the distribution.\n\n* Neither the name of the copyright holder nor the names of its\n  contributors may be used to endorse or promote products derived from\n  this software without specific prior written permission.\n\nTHIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\"\nAND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\nIMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE\nDISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE\nFOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\nDAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR\nSERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER\nCAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,\nOR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\nOF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE."
  },
  {
    "path": "README.md",
    "content": "[![pypi-badge](https://img.shields.io/pypi/v/transpropy)](https://pypi.org/project/transpropy/) [![License](https://img.shields.io/github/license/SSSYDYSSS/TransProPy)](https://github.com/SSSYDYSSS/TransProPy/blob/main/LICENSE) [![Build Status](https://github.com/SSSYDYSSS/TransProPy/actions/workflows/python-package.yml/badge.svg)](https://github.com/SSSYDYSSS/TransProPy/actions/workflows/python-package.yml)\n\n# TransProPy\n<img src=\"image/TransProPy_Pylogo.png\" alt=\"TransProPy Logo\" width=\"250\" height=\"250\" align=\"right\">\nA python package that integrate algorithms and various machine learning approaches to extract features (genes) effective for classification and attribute them accordingly.\n\n## Installation\n\n```bash\npip3 install TransProPy\npip3 install git+https://github.com/SSSYDYSSS/TransProPy.git\n```\n\n## Usage\n```python \n# e.g.: \nfrom TransProCalc import my_function\nmy_function()\n```\n\n\n## Citation\n\nIf you use TransPro in your research, please cite:\n\nDongyue Yu; Chen Li; Shuo Yan; Lujiale Guo; Jingyu Liang; Shengquan Chen*; Wenjun Bu* (2026). Comparative Evaluation of Differential Gene Selection Methods in Transcriptomics: Bias Correction and Visualization with TransPro. Manuscript in preparation.\n\n**Correspondence:**  \n\nShengquan Chen — School of Mathematical Sciences and LPMC, Nankai University, Tianjin 300071, China. \n\nWenjun Bu — Institute of Entomology, College of Life Sciences, Nankai University, Tianjin 300071, China.\n\n\n## More examples see\nTransProPy Manual：https://sssydysss.github.io/TransProPyBook/\n\n## Contributing\nPull requests are welcome. For major changes, please open an issue first to discuss what you would like to change.\n\n## License\nThis project is licensed under the BSD 3-Clause License - see the [LICENSE](./LICENSE) file for details.\n\n"
  },
  {
    "path": "TransProPy/AutoFeatureSelection.py",
    "content": "import threading\nimport time\nfrom scipy.stats import reciprocal, randint\nfrom TransProPy.UtilsFunction3.LoadAndPreprocessData import load_and_preprocess_data\nfrom TransProPy.UtilsFunction3.SetupFeatureSelection import setup_feature_selection\nfrom TransProPy.UtilsFunction3.TrainModel import train_model\nfrom TransProPy.UtilsFunction3.ExtractAndSaveResults import extract_and_save_results\nfrom TransProPy.UtilsFunction3.SetupLoggingAndProgressBar import setup_logging_and_progress_bar\nfrom TransProPy.UtilsFunction3.UpdateProgressBar import update_progress_bar\n\ndef auto_feature_selection(data_file, label_file, label_col, threshold, show_plot, show_progress, n_iter=5, n_cv=5, n_jobs=9, save_path='../data/', sleep_interval=1, use_tkagg=False):\n    \"\"\"\n    Run the complete analysis pipeline from data loading to training and result extraction.\n\n    Parameters:\n    - data_file: str, path to the feature data file.\n    - label_file: str, path to the label data file.\n    - label_col: str, name of the label column.\n    - threshold: float, threshold for data preprocessing.\n    - show_plot: bool, whether to display plot.\n    - show_progress: bool, whether to show progress bar.\n    - n_iter: int, number of iterations for RandomizedSearchCV.\n    - n_cv: int, number of folds for cross-validation.\n    - n_jobs: int, number of parallel jobs for RandomizedSearchCV.\n    - save_path: str, path to save results.\n    - sleep_interval: int, interval time in seconds for progress bar update.\n    - use_tkagg: bool, whether to use 'TkAgg' backend for matplotlib. Generally, choose False when using in PyCharm IDE, and choose True when rendering file.qmd to an HTML file.\n    \"\"\"\n\n    # Load and preprocess data\n    X, Y = load_and_preprocess_data(data_file, label_file, label_col, threshold)\n\n    # Set up feature selection\n    feature_selection = setup_feature_selection()\n\n    # Define parameters for RandomizedSearchCV\n    parameters = {\n        'feature_selection__rfecv__estimator__svm__C': reciprocal(0.001, 1000),\n        'feature_selection__rfecv__estimator__tree__max_depth': randint(2, 10),\n        'feature_selection__rfecv__estimator__tree__min_samples_split': randint(2, 10),\n        'feature_selection__rfecv__estimator__gbm__learning_rate': reciprocal(0.01, 0.2),\n        'feature_selection__rfecv__estimator__gbm__n_estimators': randint(100, 500),\n        'feature_selection__rfecv__step': randint(10, 150),\n        'feature_selection__rfecv__min_features_to_select': randint(10, 1000),\n        'feature_selection__selectkbest__k': randint(10, 200),\n        'stacking__final_estimator__C': reciprocal(0.001, 1000)  # Parameter for logistic regression in stacking classifier\n    }\n\n    # Train the model\n    clf = train_model(X, Y, feature_selection, parameters, n_iter, n_cv, n_jobs)\n\n    # Define a function to run RandomizedSearchCV\n    def run_randomized_search():\n        clf.fit(X, Y)\n\n    # Initialize tqdm progress bar and logging\n    if show_progress:\n        progress_bar = setup_logging_and_progress_bar(n_iter, n_cv)\n        search_thread = threading.Thread(target=run_randomized_search) # Use threading to run RandomizedSearchCV\n        search_thread.start()\n\n        # Update the progress bar in the main thread\n        while search_thread.is_alive():\n            update_progress_bar(progress_bar)\n            time.sleep(sleep_interval)\n\n        # Ensure RandomizedSearchCV completes\n        search_thread.join() # The main thread will wait for 
the search thread to complete all search and computation processes before continuing to execute the code after the main thread. This ensures that the main thread continues only after the search process is fully completed and the results are returned.\n    else:\n        run_randomized_search()\n\n    # Extract and save results\n    extract_and_save_results(clf, X, Y, save_path, n_cv, show_plot, use_tkagg)\n\n"
  },
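  {
    "path": "examples/run_auto_feature_selection.py",
    "content": "# Illustrative usage sketch for TransProPy.AutoFeatureSelection (assumed example file;\n# the data paths, the 'class' label column and all parameter values are assumptions,\n# not shipped defaults).\nfrom TransProPy.AutoFeatureSelection import auto_feature_selection\n\nif __name__ == '__main__':\n    # An expression matrix (genes x samples) and a CSV of sample labels are assumed\n    # to exist at these paths.\n    auto_feature_selection(\n        data_file='../data/gene_tpm.csv',\n        label_file='../data/tumor_class.csv',\n        label_col='class',\n        threshold=0.9,\n        show_plot=False,\n        show_progress=True,\n        n_iter=5,\n        n_cv=5,\n        n_jobs=4,\n        save_path='../data/',\n        sleep_interval=1,\n        use_tkagg=False,\n    )\n"
  },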
  {
    "path": "TransProPy/AutogluonSelectML.py",
    "content": "from autogluon.tabular import TabularDataset, TabularPredictor\nfrom TransProPy.UtilsFunction2.splitdata import split_data\n\ndef AutoGluon_SelectML(gene_data_path, class_data_path, label_column, test_size, threshold, hyperparameters=None, random_feature=None, num_bag_folds=None, num_stack_levels=None, time_limit=120, random_state=42):\n    \"\"\"\n    Trains a model using AutoGluon on provided data path and returns feature importance and model leaderboard.\n    ----------------------------------------------------------------------------------------------------------\n    Parameters:\n    - gene_data_path (str): Path to the gene expression data CSV file.\n        For example: '../data/gene_tpm.csv'\n    - class_data_path (str): Path to the class data CSV file.\n        For example: '../data/tumor_class.csv'\n    - label_column (str): Name of the column in the dataset that is the target label for prediction.\n    - test_size (float): Proportion of the data to be used as the test set.\n    - threshold (float): The threshold used to filter out rows based on the proportion of non-zero values.\n    - hyperparameters (dict, optional): Dictionary of hyperparameters for the models.\n        For example: {'GBM': {}, 'RF': {}}\n    - random_feature (int, optional): The number of random feature to select. If None, no random feature selection is performed. Default is None.\n    - num_bag_folds (int, optional):\n       *Please note: This parameter annotation source can be referred to the documentation link in References.\n        Number of folds used for bagging of models. When `num_bag_folds = k`, training time is roughly increased by a factor of `k` (set = 0 to disable bagging).\n        Disabled by default (0), but we recommend values between 5-10 to maximize predictive performance.\n        Increasing num_bag_folds will result in models with lower bias but that are more prone to overfitting.\n        `num_bag_folds = 1` is an invalid value, and will raise a ValueError.\n        Values > 10 may produce diminishing returns, and can even harm overall results due to overfitting.\n        To further improve predictions, avoid increasing `num_bag_folds` much beyond 10 and instead increase `num_bag_sets`.\n        default = None\n    - num_stack_levels (int, optional):\n       *Please note: This parameter annotation source can be referred to the documentation link in References.\n        Number of stacking levels to use in stack ensemble. 
Roughly increases model training time by factor of `num_stack_levels+1` (set = 0 to disable stack ensembling).\n        Disabled by default (0), but we recommend values between 1-3 to maximize predictive performance.\n        To prevent overfitting, `num_bag_folds >= 2` must also be set or else a ValueError will be raised.\n        default = None\n    - time_limit (int, optional): Time limit for training in seconds.\n        Default is 120.\n    - random_state (int, optional): The seed used by the random number generator.\n        Default is 42.\n    --------------------------------------------------------------------------------------------\n    Returns:\n    - importance (DataFrame): DataFrame containing feature importance.\n    - leaderboard (DataFrame): DataFrame containing model performance on the test data.\n    -----------------------------------------------------------------------------------\n    References:\n    Scientific Publications:\n    - AutoGluon-Tabular: Robust and Accurate AutoML for Structured Data (Arxiv, 2020)\n    - Fast, Accurate, and Simple Models for Tabular Data via Augmented Distillation (NeurIPS, 2020)\n    - Multimodal AutoML on Structured Tables with Text Fields (ICML AutoML Workshop, 2021)\n    Articles:\n    - AutoGluon for tabular data: 3 lines of code to achieve top 1% in Kaggle competitions (AWS Open Source Blog, Mar 2020)\n    - Accurate image classification in 3 lines of code with AutoGluon (Medium, Feb 2020)\n    - AutoGluon overview & example applications (Towards Data Science, Dec 2019)\n    Documentation:\n    - https://auto.gluon.ai/0.1.0/api/autogluon.predictor.html?highlight=num_bag_folds\n    --------------------------------------------------------------------------------\n    \"\"\"\n\n    train_data, test_data = split_data(gene_data_path, class_data_path, class_name=label_column, test_size=test_size, random_state=random_state, threshold=threshold, random_feature=random_feature)\n    train_data = TabularDataset(train_data)\n    test_data = TabularDataset(test_data)\n\n    # Train the model using AutoGluon\n    predictor = TabularPredictor(label=label_column).fit(train_data, hyperparameters=hyperparameters, time_limit=time_limit, num_bag_folds=num_bag_folds, num_stack_levels=num_stack_levels)\n\n    # Get the feature importance\n    importance = predictor.feature_importance(test_data, subsample_size=None)\n\n    # Get the leaderboard of models\n    leaderboard = predictor.leaderboard(test_data)\n\n    return importance, leaderboard\n\n"
  },
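  {
    "path": "examples/run_autogluon_select_ml.py",
    "content": "# Illustrative usage sketch for TransProPy.AutogluonSelectML (assumed example file;\n# the paths, label column name and hyperparameter grid are assumptions).\nfrom TransProPy.AutogluonSelectML import AutoGluon_SelectML\n\nif __name__ == '__main__':\n    importance, leaderboard = AutoGluon_SelectML(\n        gene_data_path='../data/gene_tpm.csv',\n        class_data_path='../data/tumor_class.csv',\n        label_column='class',\n        test_size=0.2,\n        threshold=0.9,\n        hyperparameters={'GBM': {}, 'RF': {}},  # restrict AutoGluon to two model families\n        num_bag_folds=5,\n        num_stack_levels=1,\n        time_limit=300,\n    )\n    print(importance.head())\n    print(leaderboard.head())\n"
  },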
  {
    "path": "TransProPy/AutogluonTimeLimit.py",
    "content": "from autogluon.tabular import TabularDataset, TabularPredictor\nfrom TransProPy.UtilsFunction2.splitdata import split_data\ndef Autogluon_TimeLimit(gene_data_path, class_data_path, label_column, test_size, threshold, random_feature=None, num_bag_folds=None, num_stack_levels=None, time_limit=120, random_state=42):\n    \"\"\"\n    2.1_autogluon_time-limit.\n    Trains a model using AutoGluon on provided data path and returns feature importance and model leaderboard.\n    ----------------------------------------------------------------------------------------------------------\n    Parameters:\n    - gene_data_path (str): Path to the gene expression data CSV file.\n        For example: '../data/gene_tpm.csv'\n    - class_data_path (str): Path to the class data CSV file.\n        For example: '../data/tumor_class.csv'\n    - label_column (str): Name of the column in the dataset that is the target label for prediction.\n    - test_size (float): Proportion of the data to be used as the test set.\n    - threshold (float): The threshold used to filter out rows based on the proportion of non-zero values.\n    - random_feature (int, optional): The number of random feature to select. If None, no random feature selection is performed.\n        Default is None.\n    - num_bag_folds (int, optional):\n       *Please note: This parameter annotation source can be referred to the documentation link in References.\n        Number of folds used for bagging of models. When `num_bag_folds = k`, training time is roughly increased by a factor of `k` (set = 0 to disable bagging).\n        Disabled by default (0), but we recommend values between 5-10 to maximize predictive performance.\n        Increasing num_bag_folds will result in models with lower bias but that are more prone to overfitting.\n        `num_bag_folds = 1` is an invalid value, and will raise a ValueError.\n        Values > 10 may produce diminishing returns, and can even harm overall results due to overfitting.\n        To further improve predictions, avoid increasing `num_bag_folds` much beyond 10 and instead increase `num_bag_sets`.\n        default = None\n    - num_stack_levels (int, optional):\n       *Please note: This parameter annotation source can be referred to the documentation link in References.\n        Number of stacking levels to use in stack ensemble. 
Roughly increases model training time by factor of `num_stack_levels+1` (set = 0 to disable stack ensembling).\n        Disabled by default (0), but we recommend values between 1-3 to maximize predictive performance.\n        To prevent overfitting, `num_bag_folds >= 2` must also be set or else a ValueError will be raised.\n        default = None\n    - time_limit (int, optional): Time limit for training in seconds.\n        Default is 120.\n    - random_state (int): The seed used by the random number generator.\n        Default is 42.\n    ----------------------------------------------------------------------------------\n    Returns:\n    - importance (DataFrame): DataFrame containing feature importance.\n    - leaderboard (DataFrame): DataFrame containing model performance on the test data.\n    -----------------------------------------------------------------------------------\n    References:\n    Scientific Publications:\n    - AutoGluon-Tabular: Robust and Accurate AutoML for Structured Data (Arxiv, 2020)\n    - Fast, Accurate, and Simple Models for Tabular Data via Augmented Distillation (NeurIPS, 2020)\n    - Multimodal AutoML on Structured Tables with Text Fields (ICML AutoML Workshop, 2021)\n    Articles:\n    - AutoGluon for tabular data: 3 lines of code to achieve top 1% in Kaggle competitions (AWS Open Source Blog, Mar 2020)\n    - Accurate image classification in 3 lines of code with AutoGluon (Medium, Feb 2020)\n    - AutoGluon overview & example applications (Towards Data Science, Dec 2019)\n    Documentation:\n    - https://auto.gluon.ai/0.1.0/api/autogluon.predictor.html?highlight=num_bag_folds\n    --------------------------------------------------------------------------------\n    \"\"\"\n    train_data, test_data = split_data(gene_data_path, class_data_path, class_name=label_column, test_size=test_size, random_state=random_state, threshold=threshold, random_feature=random_feature)\n    train_data = TabularDataset(train_data)\n    test_data = TabularDataset(test_data)\n\n    # Train the model using AutoGluon\n    predictor = TabularPredictor(label=label_column).fit(train_data, time_limit=time_limit, num_bag_folds=num_bag_folds, num_stack_levels=num_stack_levels)\n\n    # Get the feature importance\n    importance = predictor.feature_importance(test_data, subsample_size=None)\n\n    # Get the leaderboard of models\n    leaderboard = predictor.leaderboard(test_data)\n\n    return importance, leaderboard\n\n\n\n\n\n"
  },
  {
    "path": "TransProPy/MACFCmain.py",
    "content": "from numpy import *\nfrom TransProPy.UtilsFunction1.LoadData import load_data\nfrom TransProPy.UtilsFunction1.FeatureRanking import feature_ranking\nfrom TransProPy.UtilsFunction1.PrintResults import print_results\nfrom collections import Counter\n\ndef MACFCmain(max_rank, lable_name, threshold, data_path='../data/gene_tpm.csv', label_path='../data/tumor_class.csv'):\n    \"\"\"\n    1.1_feature_ranking_modle.\n    Applying the MACFC selection for relevant feature genes in classification.\n    --------------------------------------------------------------------------\n    Parameters:\n    max_rank: int\n        The total number of gene combinations you want to obtain.\n    lable_name: string\n        For example: gender, age, altitude, temperature, quality, and other categorical variable names.\n    data_path: string\n        For example: '../data/gene_tpm.csv'\n        Please note: Preprocess the input data in advance to remove samples that contain too many missing values or zeros.\n    label_path: string\n        For example: '../data/tumor_class.csv'\n        Please note: The input sample categories must be in a numerical binary format, such as: 1,2,1,1,2,2,1.\n        In this case, the numerical values represent the following classifications: 1: male; 2: female.\n    threshold: float\n        For example: 0.9\n        The set threshold indicates the proportion of non-zero value samples to all samples in each feature.\n    --------------------------------------------------------------------------------------------------------\n    Returns:\n    fr: list of strings\n        representing ranked features.\n    fre1: dictionary\n        feature names as keys and their frequencies as values.\n    frequency: list of tuples\n        feature names and their frequencies.\n        The frequency outputs a list sorted by occurrence frequency (in descending order). This list includes only those elements from the dictionary fre1 (which represents the counted frequencies of elements in the original data) that have an occurrence frequency greater than once, along with their frequencies.\n    len(FName): integer\n        count of AUC values greater than 0.5.\n    FName: array of strings\n        feature names after ranking with AUC > 0.5.\n    Fauc: array of floats\n        AUC values corresponding to the ranked feature names.\n    ---------------------------------------------------------\n    References:\n    - Su,Y., Du,K., Wang,J., Wei,J. and Liu,J. (2022) Multi-variable AUC for sifting complementary features and its biomedical application. Briefings in Bioinformatics, 23, bbac029.\n    -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n    \"\"\"\n    # load data\n    f, c = load_data(lable_name, threshold, data_path, label_path)\n\n    pos, neg = set(c)\n    n0, n1 = list(c).count(pos), list(c).count(neg)\n\n    FName, Fauc, fr, fre = feature_ranking(f, c, max_rank, pos, neg, n0, n1)  # Note that here n0 and n1 are passed as parameters.\n\n    fre1 = dict(Counter(fre))\n    fre2 = {key: value for key, value in fre1.items() if value > 1}\n    frequency = sorted(fre2.items(), key=lambda kv: (kv[1], kv[0]), reverse=True)\n\n    # print_results(fr, fre1, frequency, len(FName), FName, Fauc)\n    return(fr, fre1, frequency, len(FName), FName, Fauc)\n\n\n\n\n\n\n"
  },
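  {
    "path": "examples/run_macfc_ranking.py",
    "content": "# Illustrative end-to-end sketch (assumed example file): rank features with MACFCmain,\n# then map the ranked feature indices back to gene names. The paths and the label\n# name are assumptions.\nfrom TransProPy.MACFCmain import MACFCmain\nfrom TransProPy.UtilsFunction1.GeneNames import gene_name\nfrom TransProPy.UtilsFunction1.GeneToFeatureMapping import gene_map_feature\n\nif __name__ == '__main__':\n    fr, fre1, frequency, n_auc, FName, Fauc = MACFCmain(\n        max_rank=50,\n        lable_name='class',  # note: the parameter is spelled 'lable_name' in the API\n        threshold=0.9,\n        data_path='../data/gene_tpm.csv',\n        label_path='../data/tumor_class.csv',\n    )\n    # Caution (assumption): the indices in fr refer to the filtered matrix that MACFCmain\n    # actually ranked; if filtering removed genes, apply the same filter before mapping.\n    gene_names = gene_name(data_path='../data/gene_tpm.csv')\n    mapping = gene_map_feature(gene_names, fr)  # gene name -> feature index\n    print(mapping)\n"
  },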
  {
    "path": "TransProPy/NewMACFCmain.py",
    "content": "from numpy import *\nfrom TransProPy.UtilsFunction1.LoadData import load_data\nfrom TransProPy.UtilsFunction1.NewFeatureRanking import new_feature_ranking\nfrom TransProPy.UtilsFunction1.PrintResults import print_results\nfrom collections import Counter\n\ndef New_MACFCmain(AUC_threshold, max_rank, lable_name, threshold, data_path='../data/gene_tpm.csv', label_path='../data/tumor_class.csv'):\n    \"\"\"\n    1.1_feature_ranking_modle.\n    Applying the MACFC selection for relevant feature genes in classification.\n    --------------------------------------------------------------------------\n    Parameters:\n    AUC_threshold: float\n        AUC threshold for feature selection. Features with AUC values higher than this threshold are recorded but not used in subsequent calculations.\n    max_rank: int\n        The total number of gene combinations you want to obtain.\n    lable_name: string\n        For example: gender, age, altitude, temperature, quality, and other categorical variable names.\n    data_path: string\n        For example: '../data/gene_tpm.csv'\n        Please note: Preprocess the input data in advance to remove samples that contain too many missing values or zeros.\n    label_path: string\n        For example: '../data/tumor_class.csv'\n        Please note: The input sample categories must be in a numerical binary format, such as: 1,2,1,1,2,2,1.\n        In this case, the numerical values represent the following classifications: 1: male; 2: female.\n    threshold: float\n        For example: 0.9\n        The set threshold indicates the proportion of non-zero value samples to all samples in each feature.\n    --------------------------------------------------------------------------------------------------------\n    Returns:\n    high_auc_features: list of tuples\n        This list contains tuples of feature indices and their corresponding AUC values, where the AUC value is greater than 0.95. Each tuple consists of the feature's index in string format and its AUC value as a float. This signifies that these features are highly predictive, with a strong ability to distinguish between different classes in the classification task.\n    fr: list of strings\n        representing ranked features.\n    fre1: dictionary\n        feature names as keys and their frequencies as values.\n    frequency: list of tuples\n        feature names and their frequencies.\n        The frequency outputs a list sorted by occurrence frequency (in descending order). This list includes only those elements from the dictionary fre1 (which represents the counted frequencies of elements in the original data) that have an occurrence frequency greater than once, along with their frequencies.\n    len(FName): integer\n        count of AUC values greater than 0.5.\n    FName: array of strings\n        feature names after ranking with AUC > 0.5.\n    Fauc: array of floats\n        AUC values corresponding to the ranked feature names.\n    ---------------------------------------------------------\n    References:\n    - Su,Y., Du,K., Wang,J., Wei,J. and Liu,J. (2022) Multi-variable AUC for sifting complementary features and its biomedical application. 
Briefings in Bioinformatics, 23, bbac029.\n    -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n    \"\"\"\n    # load data\n    f, c = load_data(lable_name, threshold, data_path, label_path)\n\n    pos, neg = set(c)\n    n0, n1 = list(c).count(pos), list(c).count(neg)\n\n    high_auc_features, FName, Fauc, fr, fre = new_feature_ranking(f, c, AUC_threshold, max_rank, pos, neg, n0, n1)  # Note that here n0 and n1 are passed as parameters.\n\n    fre1 = dict(Counter(fre))\n    fre2 = {key: value for key, value in fre1.items() if value > 1}\n    frequency = sorted(fre2.items(), key=lambda kv: (kv[1], kv[0]), reverse=True)\n\n    # print_results(fr, fre1, frequency, len(FName), FName, Fauc)\n    return(high_auc_features, fr, fre1, frequency, len(FName), FName, Fauc)\n\n\n"
  },
  {
    "path": "TransProPy/UtilsFunction1/Auc.py",
    "content": "from numpy import *\ndef auc(tlofe, ne, n0, n1):\n    lpp = 0\n    lnp = 0\n    flag = 0\n    aac = 0\n    for i in range(-1, -size(tlofe) - 1, -1):\n        if tlofe[i] == ne:\n            if flag == 1:\n                aac += lnp * lpp\n                flag = 0\n                lpp = 0\n            lnp += 1\n        else:\n            if flag == 0:\n                flag = 1\n            lpp += 1\n    aac += lnp * lpp\n    auc = (n0 * n1 - aac) / (n0 * n1)\n    return auc\n"
  },
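  {
    "path": "examples/check_auc_utility.py",
    "content": "# Minimal sanity-check sketch (assumed example file) for UtilsFunction1.Auc.auc:\n# on tie-free feature values its result should match sklearn's roc_auc_score when\n# the class that tends to take higher values is treated as positive.\nimport numpy as np\nfrom sklearn.metrics import roc_auc_score\nfrom TransProPy.UtilsFunction1.Auc import auc\n\nfeature = np.array([0.1, 0.4, 0.7, 0.9])  # ascending, no ties\nlabels = np.array([2, 1, 2, 1])           # binary classes coded 1/2\n\nslofe = labels[np.argsort(feature)]  # labels sorted by ascending feature value\nne = slofe[0]                        # label at the lowest value -> 'negative' class\nn0, n1 = (labels == 1).sum(), (labels == 2).sum()\n\na = auc(slofe, ne, n0, n1)\nref = roc_auc_score((labels != ne).astype(int), feature)\nprint(a, ref)  # both 0.75 for this toy input\nassert abs(a - ref) < 1e-12\n"
  },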
  {
    "path": "TransProPy/UtilsFunction1/AutoNorm.py",
    "content": "from numpy import *\ndef auto_norm(data):\n    # data:（sample,feature）\n    \"\"\"\n    Normalization Function\n        The auto_norm function is designed to normalize a two-dimensional array (matrix). The purpose of normalization is generally to bring all features into the same numerical range, facilitating subsequent analysis or model training.\n    ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n    Parameters:\n    data: ndarray\n        Order Requirements for Input Data：\n        1.This function does indeed have specific requirements for the row and column order of the input matrix data. Rows should represent individual samples, and columns should represent different features. In other words, each row vector represents a sample containing multiple features.\n        2.Each column of the matrix will be independently normalized, so different features should be placed in separate columns.\n    -----------------------------------------------------------------------------------------------------------------------------\n    Returns:\n    norm_data: ndarray\n        It is the normalized data.\n    ------------------------------\n    \"\"\"\n    mins = data.min(0)\n    maxs = data.max(0)\n    ranges = maxs - mins\n    row = data.shape[0]\n    norm_data = data - tile(mins, (row, 1))\n    norm_data = norm_data / tile(ranges, (row, 1))\n    return norm_data\n"
  },
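  {
    "path": "examples/check_auto_norm.py",
    "content": "# Tiny sketch (assumed example file) of UtilsFunction1.AutoNorm.auto_norm:\n# each column is independently min-max scaled to [0, 1].\nimport numpy as np\nfrom TransProPy.UtilsFunction1.AutoNorm import auto_norm\n\ndata = np.array([[1.0, 10.0],\n                 [2.0, 30.0],\n                 [3.0, 50.0]])  # rows = samples, columns = features\nprint(auto_norm(data))\n# [[0.  0. ]\n#  [0.5 0.5]\n#  [1.  1. ]]\n"
  },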
  {
    "path": "TransProPy/UtilsFunction1/FeatureRanking.py",
    "content": "from numpy import *\nfrom TransProPy.UtilsFunction1.Auc import auc\ndef feature_ranking(f, c, max_rank, pos, neg, n0, n1):\n    f_auc = []\n    f_no = [str(i) for i in range(shape(f)[0])]\n    f_mtf = full((shape(f)[0], shape(f)[1]), False)\n    f_ne = []\n    fl = shape(f_no)[0]\n    # print('fl ', fl)\n\n\n    for j in range(fl):\n        argfv = argsort(f[j])\n        slofe = c[argfv]\n        ne = slofe[0]\n        a = auc(slofe, ne, n0, n1)\n        if a < 0.5:\n            if slofe[0] == slofe[-1]:\n                a = 1 - a\n                if ne == pos:\n                    ne = neg\n                else:\n                    ne = pos\n        f_auc.append(a)\n        f_ne.append(ne)\n\n        ml = 1\n        mr = 1\n        for i in range(1, size(slofe)):\n            if slofe[i] == slofe[0]:\n                ml += 1\n            else:\n                break\n        for i in range(-2, -size(slofe), -1):\n            if slofe[i] == slofe[-1]:\n                mr += 1\n            else:\n                break\n        mr = size(slofe) - mr\n\n        if slofe[0] == slofe[-1]:\n            if not slofe[0] == ne:\n                ml = 0\n            else:\n                mr = size(slofe)\n        f_mtf[j][argfv[ml:mr]] = True\n        # print(f_auc)\n    arg_auc = argsort(-array(f_auc))\n    FName = array(f_no)[arg_auc]\n    Fvalue = array(f)[arg_auc]\n    Fauc = array(f_auc)[arg_auc]\n    Fne = array(f_ne)[arg_auc]\n    FmTF = array(f_mtf)[arg_auc]\n    # print('SORT VALUE', Fvalue)\n    # print('SORT M', FmTF)\n    # print('SORT NAME', FName)\n    # print('SORT AUC', Fauc)\n\n    kk = 0\n    slen = 0\n    Fmcount = ones((len(FmTF[0])))\n    Fmcount = Fmcount.astype(bool)\n    for i in range(fl):\n        if Fauc[i] < 0.5:\n            kk += 1\n        Fmcount &= FmTF[i]\n        if True in Fmcount:\n            slen += 1\n    # print('Totally ', kk, ' features with auc under 0.5')\n\n    for i in range(fl):\n        if Fauc[i] < 0.5:\n            continue\n        for j in range(i + 1, fl):\n            if Fauc[j] < 0.5:\n                continue\n            nflg = 0\n            if not ((FmTF[i] & FmTF[j]) == FmTF[i]).all():\n                if not ((FmTF[i] & FmTF[j]) == FmTF[j]).all():\n                    nflg = 1\n            if nflg == 0:\n                if FmTF[i].sum() <= FmTF[j].sum():\n                    Fauc[j] = -2\n                else:\n                    Fauc[i] = -2\n                    break\n    ii = []\n    gg = []\n    for i in range(fl):\n        if Fauc[i] > 0.5:\n            ii.append(i)\n        else:\n            gg.append(i)\n    arg_auc_gg = argsort(-array(f_auc)[gg])\n    gg = [str(FName[i]) for i in array(gg)[arg_auc_gg]]\n    # print('Totally ' + str(fl - len(ii)) + ' features are covered and removed.')\n    # print(gg)\n    FName = FName[ii]\n    Fvalue = Fvalue[ii]\n    Fauc = Fauc[ii]\n    Fne = Fne[ii]\n    FmTF = FmTF[ii]\n\n    over = 0\n    if max_rank > len(ii):\n        over = max_rank - len(ii)\n        max_rank = len(ii)\n\n\n    # start ranking\n    rankset = []  # store unique features\n    ranklist = []  # with overlap\n    order = 0\n    while len(rankset) < max_rank:\n        ## start selection\n        rnk = 2\n        mv_auc = Fauc[order]\n        fs = [FName[order]]\n        cpms = FmTF[order]\n        fl = shape(FName)[0]\n\n        while mv_auc != 1:\n            ft = 0\n            temp = 0\n            for j in range(fl):\n                if FName[j] not in fs:\n                    tmpFmTF = cpms & FmTF[j]\n 
                   if not ((FmTF[j] & cpms) == cpms).all():\n                        mauc = 0\n                        for g in fs + [FName[j]]:\n                            fval = Fvalue[argwhere(FName == g)[0][0]][tmpFmTF]\n                            stwlofe = array(c)[tmpFmTF]\n                            argfv = argsort(fval)\n                            slofe = stwlofe[argfv]\n                            tauc = auc(slofe, Fne[argwhere(FName == g)[0][0]], n0, n1)\n                            mauc += tauc\n                        tmpauc = mauc / rnk\n                        if tmpauc > mv_auc:\n                            mv_auc = tmpauc\n                            ft = j\n                            temp = Fauc[j]\n                        elif tmpauc == mv_auc and Fauc[j] > temp:\n                            ft = j\n                            temp = Fauc[j]\n\n            if mv_auc == -2 or ft == 0:\n                break\n            fs.append(FName[ft])\n            cpms = cpms & FmTF[ft]\n            rnk += 1\n            # print('\\nRank-' + str(rnk - 1) + ' mvAUC: ' + str(mv_auc) + '  Feature set:', fs)\n\n        for i in fs:\n            ranklist.append(i)\n            if i not in rankset:\n                rankset.append(i)\n\n        order += 1\n\n    if over != 0:\n        ranklist = ranklist + list(gg)[:over]\n        rankset = rankset + list(gg)[:over]\n\n    return FName, Fauc, rankset, ranklist"
  },
  {
    "path": "TransProPy/UtilsFunction1/FilterSamples.py",
    "content": "import pandas as pd\n\ndef filter_samples(threshold, data_path='../data/gene_tpm.csv'):\n    \"\"\"\n    Remove samples with high zero expression.\n    -----------------------------------------\n    Parameters\n    data_path: string\n        For example: '../data/gene_tpm.csv'\n        Please note: The input data matrix should have genes as rows and samples as columns.\n    threshold: float\n        For example: 0.9\n        The set threshold indicates the proportion of non-zero value samples to all samples in each feature.\n    --------------------------------------------------------------------------------------------------------\n    Return\n        X: pandas.core.frame.DataFrame\n    -----------------------------------\n    \"\"\"\n    data = pd.read_csv(data_path, index_col=0, header=0)\n    # Calculate the count of non-zero values in each row.\n    non_zero_counts = data.astype(bool).sum(axis=1)\n    # Set a threshold indicating the proportion of gene expressions that are zeros.\n    # threshold = 0.9\n    # Filter rows based on the threshold.\n    X = data[non_zero_counts / data.shape[1] > threshold]\n    # Return the result.\n    return X\n"
  },
  {
    "path": "TransProPy/UtilsFunction1/GeneNames.py",
    "content": "import os\nfrom pandas import read_csv, merge\n\n\ndef gene_name(data_path='../data/gene_tpm.csv'):\n    \"\"\"\n    Extract gene_names data.\n    ------------------------\n    Parameters:\n    data_path: string\n        For example: '../data/gene_tpm.csv'\n        Please note: Preprocess the input data in advance to remove samples that contain too many missing values or zeros.\n        The input data matrix should have genes as rows and samples as columns.\n    ---------------------------------------------------------------------------\n    Return:\n    gene_name: list\n    ---------------------------------------------------------------------------\n    \"\"\"\n    # Check if the data files exist at the given paths\n    if not os.path.exists(data_path):\n        raise FileNotFoundError(\n            f\"The data file was not found at '{data_path}'. Please ensure it's in the correct location.\")\n\n    # Load data and labels\n    data = read_csv(data_path, header=0, index_col=0)  # Assuming row names are gene names\n    # Get the gene names directly from the row names (assuming row names are gene names)\n    gene_names = data.index.tolist()\n\n    return gene_names"
  },
  {
    "path": "TransProPy/UtilsFunction1/GeneToFeatureMapping.py",
    "content": "def gene_map_feature(gene_names, ranked_features):\n    \"\"\"\n    gene map feature.\n    ------------------------\n    Parameters\n    gene_names: list\n        For example: ['GeneA', 'GeneB', 'GeneC', 'GeneD', 'GeneE']\n        containing strings\n    ranked_features: list\n        For example: [2, 0, 1]\n        containing integers\n    -----------------------\n    Return\n        gene_to_feature_mapping: dictionary\n        gene_to_feature_mapping is a Python dictionary type. It is used to map gene names to their corresponding feature (or ranked feature) names.\n    -----------------------------------------------------------------------------------------------------------------------------------------------\n    \"\"\"\n    gene_to_feature_mapping = {}\n\n    for feature_index_str in ranked_features:\n        feature_index = int(feature_index_str)\n        if 0 <= feature_index < len(gene_names):\n            gene_name = gene_names[feature_index]\n            gene_to_feature_mapping[gene_name] = feature_index\n        else:\n            print(f\"Invalid feature index: {feature_index}\")\n\n    return gene_to_feature_mapping"
  },
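  {
    "path": "examples/check_gene_map_feature.py",
    "content": "# Small runnable sketch (assumed example file) of gene_map_feature: ranked feature\n# indices (the numeric strings produced by MACFCmain) are resolved to gene names.\nfrom TransProPy.UtilsFunction1.GeneToFeatureMapping import gene_map_feature\n\ngene_names = ['GeneA', 'GeneB', 'GeneC', 'GeneD', 'GeneE']\nranked_features = ['2', '0', '1']  # indices as strings, highest rank first\n\nmapping = gene_map_feature(gene_names, ranked_features)\nprint(mapping)  # {'GeneC': 2, 'GeneA': 0, 'GeneB': 1}\n"
  },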
  {
    "path": "TransProPy/UtilsFunction1/LoadData.py",
    "content": "from pandas import *\nfrom numpy import *\nimport os\nfrom TransProPy.UtilsFunction1.AutoNorm import auto_norm\nfrom TransProPy.UtilsFunction1.FilterSamples import filter_samples\ndef load_data(lable_name, threshold, data_path='../data/gene_tpm.csv', label_path='../data/tumor_class.csv'):\n    \"\"\"\n    Data Reading and Transformation.\n        Data normalization for constant value\n        Extract matrix data and categorical data.\n    ---------------------------------------------\n    Parameters:\n    lable_name: string\n        For example: gender, age, altitude, temperature, quality, and other categorical variable names.\n    data_path: string\n        For example: '../data/gene_tpm.csv'\n        Please note: Preprocess the input data in advance to remove samples that contain too many missing values or zeros.\n        The input data matrix should have genes as rows and samples as columns.\n    label_path: string\n        For example: '../data/tumor_class.csv'\n        Please note: The input CSV data should have rows representing sample names and columns representing class names.\n        The input sample categories must be in a numerical binary format, such as: 1,2,1,1,2,2,1.\n        In this case, the numerical values represent the following classifications: 1: male; 2: female.\n    threshold: float\n        For example: 0.9\n        The set threshold indicates the proportion of non-zero value samples to all samples in each feature.\n    --------------------------------------------------------------------------------------------------------\n    Returns:\n    transpose(f): ndarray\n        A transposed feature-sample matrix.\n    c: ndarray\n        A NumPy array containing classification labels.\n    ---------------------------------------------------------\n    \"\"\"\n    # Check if the data files exist at the given paths\n    if not os.path.exists(data_path):\n        raise FileNotFoundError(\n            f\"The data file was not found at '{data_path}'. Please ensure it's in the correct location.\")\n\n    if not os.path.exists(label_path):\n        raise FileNotFoundError(\n            f\"The label file was not found at '{label_path}'. Please ensure it's in the correct location.\")\n\n    # Continue with the rest of your function\n    data = filter_samples(threshold, data_path)\n    # data = read_csv(data_path, header=0, index_col=0)\n    data = data.transpose()\n    lable = read_csv(label_path, header=0, index_col=0)\n    lable = lable[lable_name]\n    data = merge(data, lable, left_index=True, right_index=True)\n    values = unique(data.values, axis=0)\n    f = auto_norm(values[:, :-1])  # data normalization for constant value\n    c = values[:, -1]\n\n    return transpose(f), c"
  },
  {
    "path": "TransProPy/UtilsFunction1/NewFeatureRanking.py",
    "content": "from numpy import *\nfrom TransProPy.UtilsFunction1.Auc import auc\n\ndef new_feature_ranking(f, c, AUC_threshold, max_rank, pos, neg, n0, n1):\n    f_auc = []\n    f_no = [str(i) for i in range(shape(f)[0])]\n    f_mtf = full((shape(f)[0], shape(f)[1]), False)\n    f_ne = []\n    fl = shape(f_no)[0]\n\n    # New addition: To store features with AUC greater than AUC_threshold and their AUC values\n\n    high_auc_features = []\n\n    # Calculate the AUC for each feature\n    for j in range(fl):\n        argfv = argsort(f[j])\n        slofe = c[argfv]\n        ne = slofe[0]\n        a = auc(slofe, ne, n0, n1)\n        if a < 0.5:\n            if slofe[0] == slofe[-1]:\n                a = 1 - a\n                if ne == pos:\n                    ne = neg\n                else:\n                    ne = pos\n        f_auc.append(a)\n        f_ne.append(ne)\n\n        # New addition: Check and record features with AUC greater than AUC_threshold.\n        if a > AUC_threshold:\n            high_auc_features.append((f_no[j], a))\n\n        # Sort high_auc_features by AUC value\n        high_auc_features = sorted(high_auc_features, key=lambda x: x[1], reverse=True)\n\n        ml = 1\n        mr = 1\n        for i in range(1, size(slofe)):\n            if slofe[i] == slofe[0]:\n                ml += 1\n            else:\n                break\n        for i in range(-2, -size(slofe), -1):\n            if slofe[i] == slofe[-1]:\n                mr += 1\n            else:\n                break\n        mr = size(slofe) - mr\n\n        if slofe[0] == slofe[-1]:\n            if not slofe[0] == ne:\n                ml = 0\n            else:\n                mr = size(slofe)\n        f_mtf[j][argfv[ml:mr]] = True\n\n    # New addition: Exclude features with AUC greater than AUC_threshold from the original set.\n    remaining_indices = [i for i, a in enumerate(f_auc) if a <= AUC_threshold]\n    remaining_f_no = [f_no[i] for i in remaining_indices]\n    remaining_f_auc = [f_auc[i] for i in remaining_indices]\n    remaining_f_mtf = [f_mtf[i] for i in remaining_indices]\n    remaining_f_ne = [f_ne[i] for i in remaining_indices]\n\n    # Update 'fl' to the number of remaining features.\n    fl = len(remaining_f_no)\n\n    # Sort and process the remaining features.\n    arg_auc = argsort(-array(remaining_f_auc))\n    FName = array(remaining_f_no)[arg_auc]\n    Fvalue = array(f)[arg_auc]\n    Fauc = array(remaining_f_auc)[arg_auc]\n    Fne = array(remaining_f_ne)[arg_auc]\n    FmTF = array(remaining_f_mtf)[arg_auc]\n\n    kk = 0\n    slen = 0\n    Fmcount = ones((len(FmTF[0])))\n    Fmcount = Fmcount.astype(bool)\n    for i in range(fl):\n        if Fauc[i] < 0.5:\n            kk += 1\n        Fmcount &= FmTF[i]\n        if True in Fmcount:\n            slen += 1\n    # print('Totally ', kk, ' features with auc under 0.5')\n\n    for i in range(fl):\n        if Fauc[i] < 0.5:\n            continue\n        for j in range(i + 1, fl):\n            if Fauc[j] < 0.5:\n                continue\n            nflg = 0\n            if not ((FmTF[i] & FmTF[j]) == FmTF[i]).all():\n                if not ((FmTF[i] & FmTF[j]) == FmTF[j]).all():\n                    nflg = 1\n            if nflg == 0:\n                if FmTF[i].sum() <= FmTF[j].sum():\n                    Fauc[j] = -2\n                else:\n                    Fauc[i] = -2\n                    break\n    ii = []\n    gg = []\n    for i in range(fl):\n        if Fauc[i] > 0.5:\n            ii.append(i)\n        else:\n            
gg.append(i)\n    arg_auc_gg = argsort(-array(f_auc)[gg])\n    gg = [str(FName[i]) for i in array(gg)[arg_auc_gg]]\n    # print('Totally ' + str(fl - len(ii)) + ' features are covered and removed.')\n    # print(gg)\n    FName = FName[ii]\n    Fvalue = Fvalue[ii]\n    Fauc = Fauc[ii]\n    Fne = Fne[ii]\n    FmTF = FmTF[ii]\n\n    over = 0\n    if max_rank > len(ii):\n        over = max_rank - len(ii)\n        max_rank = len(ii)\n\n    # start ranking\n    rankset = []  # store unique features\n    ranklist = []  # with overlap\n    order = 0\n    while len(rankset) < max_rank:\n        ## start selection\n        rnk = 2\n        mv_auc = Fauc[order]\n        fs = [FName[order]]\n        cpms = FmTF[order]\n        fl = shape(FName)[0]\n\n        while mv_auc != 1:\n            ft = 0\n            temp = 0\n            for j in range(fl):\n                if FName[j] not in fs:\n                    tmpFmTF = cpms & FmTF[j]\n                    if not ((FmTF[j] & cpms) == cpms).all():\n                        mauc = 0\n                        for g in fs + [FName[j]]:\n                            fval = Fvalue[argwhere(FName == g)[0][0]][tmpFmTF]\n                            stwlofe = array(c)[tmpFmTF]\n                            argfv = argsort(fval)\n                            slofe = stwlofe[argfv]\n                            tauc = auc(slofe, Fne[argwhere(FName == g)[0][0]], n0, n1)\n                            mauc += tauc\n                        tmpauc = mauc / rnk\n                        if tmpauc > mv_auc:\n                            mv_auc = tmpauc\n                            ft = j\n                            temp = Fauc[j]\n                        elif tmpauc == mv_auc and Fauc[j] > temp:\n                            ft = j\n                            temp = Fauc[j]\n\n            if mv_auc == -2 or ft == 0:\n                break\n            fs.append(FName[ft])\n            cpms = cpms & FmTF[ft]\n            rnk += 1\n            # print('\\nRank-' + str(rnk - 1) + ' mvAUC: ' + str(mv_auc) + '  Feature set:', fs)\n\n        for i in fs:\n            ranklist.append(i)\n            if i not in rankset:\n                rankset.append(i)\n\n        order += 1\n\n    if over != 0:\n        ranklist = ranklist + list(gg)[:over]\n        rankset = rankset + list(gg)[:over]\n\n\n    # Return the features with an AUC greater than AUC_threshold, and other ranked and filtered feature information\n    return high_auc_features, FName, Fauc, rankset, ranklist\n\n\n"
  },
  {
    "path": "TransProPy/UtilsFunction1/PrintResults.py",
    "content": "def print_results(high_auc_features, fr, fre1, frequency, len_FName, FName, Fauc):\n\n    print('Ranked features (start from higher rank): ', fr)\n    print('Features and its frequency: ', fre1)\n    print('Sorted features with frequency higher than 1: ', frequency)\n    print('The count of AUC values greater than 0.5: ', len_FName)\n    print('The list of feature names after ranking (AUC > 0.5): ', FName)\n    print('The list of AUC values corresponding to the ranked feature names: ', Fauc)\n"
  },
  {
    "path": "TransProPy/UtilsFunction1/__init__.py",
    "content": ""
  },
  {
    "path": "TransProPy/UtilsFunction2/LogTransform.py",
    "content": "import numpy as np\n\ndef log_transform(data):\n    \"\"\"\n    Evaluate and potentially apply log2 transformation to data.\n    -This function checks data against a set of criteria to determine if a log2 transformation is needed, applying the transformation if necessary.\n    -----------------------------------------------------------------------------------------------------------------------------------------------\n    Parameters:\n    -data (np.ndarray): A numerical numpy array.\n    ------------------------------------------\n    Returns:\n    -result(np.ndarray): The original data or the data transformed with log2.\n    -----------------------------------------------------------------\n    \"\"\"\n    # Calculate quantiles\n    qx = np.quantile(data, [0., 0.25, 0.5, 0.75, 0.99, 1.0])\n\n    # Define conditions for log transformation\n    LogC = (qx[4] > 100) or \\\n           (qx[5] - qx[0] > 50 and qx[1] > 0) or \\\n           (qx[1] > 0 and qx[1] < 1 and qx[3] > 1 and qx[3] < 2)\n\n    # Apply log transformation based on conditions\n    if LogC:\n        data[data <= 0] = np.NaN  # Use NaN for non-applicable data\n        result = np.log2(data)\n        print(\"log2 transform finished\")\n    else:\n        result = data\n        print(\"log2 transform not needed\")\n\n    return result"
  },
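  {
    "path": "examples/check_log_transform.py",
    "content": "# Sketch (assumed example file) of UtilsFunction2.LogTransform.log_transform on\n# synthetic data: the 0.99 quantile exceeds 100, so the heuristic fires,\n# non-positive entries become NaN and log2 is applied.\nimport numpy as np\nfrom TransProPy.UtilsFunction2.LogTransform import log_transform\n\ndata = np.array([[0.0, 4.0, 256.0],\n                 [2.0, 64.0, 1024.0]])\nprint(log_transform(data))\n# prints 'log2 transform finished'; 0.0 becomes NaN, 4 -> 2, 256 -> 8, 1024 -> 10\n"
  },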
  {
    "path": "TransProPy/UtilsFunction2/__init__.py",
    "content": ""
  },
  {
    "path": "TransProPy/UtilsFunction2/splitdata.py",
    "content": "import pandas as pd\nfrom sklearn.utils import shuffle\nfrom sklearn.model_selection import train_test_split\n\ndef split_data(gene_data_path, class_data_path, class_name, test_size=0.2, random_state=42, threshold=0.9, random_feature=None):\n    \"\"\"\n    Reads the gene expression and class data, processes it, and splits it into training and testing sets.\n    -----------------------------------------------------------------------------------------------------\n    Parameters:\n    - gene_data_path (str): Path to the CSV file containing the gene expression data.\n        For example: '../data/gene_tpm.csv'\n    - class_data_path (str): Path to the CSV file containing the class data.\n        For example: '../data/tumor_class.csv'\n    - class_name (str): The name of the class column in the class data.\n    - test_size (float, optional): The proportion of the data to be used as the testing set. Default is 0.2.\n    - random_state (int, optional): The seed used by the random number generator. Default is 42.\n    - threshold (float, optional): The threshold used to filter out rows based on the proportion of non-zero values. Default is 0.9.\n    - random_feature (int, optional): The number of random feature to select. If None, no random feature selection is performed. Default is None.\n    ---------------------------------------------------------------------------------------------------------------------------------------------\n    Returns:\n    - train_data (pd.DataFrame): The training data.\n    - test_data (pd.DataFrame): The testing data.\n    ---------------------------------------------\n    \"\"\"\n\n    # Reading the data\n    X = pd.read_csv(gene_data_path, index_col=0, header=0)\n    y = pd.read_csv(class_data_path, index_col=0, header=0)\n\n    # Finding common sample names between X(column names) and y(row names)\n    common = X.columns.intersection(y.index)\n\n    # Filtering out low-quality data\n    non_zero_counts = X.astype(bool).sum(axis=1)\n    X = X[non_zero_counts / X.shape[1] > threshold]\n\n    # If random_sample is specified, perform random sampling on X\n    if random_feature is not None:\n        X = X.sample(n=random_feature, random_state=random_state)\n\n    # Keeping only the common samples in X and y\n    X = X.loc[:, common]\n    y = y.loc[common]\n\n    # Transposing X and merging it with the specified column from y\n    X = X.transpose()\n    Y = y[class_name]\n    data = pd.merge(X, Y, left_index=True, right_index=True)\n\n    # data is a DataFrame containing features and labels\n    # First, randomize the data\n    data = shuffle(data, random_state=42)\n\n    # Splitting the data into training and validation sets\n    # Then perform stratified sampling\n    train_data, test_data = train_test_split(data, test_size=test_size, random_state=random_state, stratify=data[class_name])\n\n    return train_data, test_data\n\n"
  },
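  {
    "path": "examples/check_split_data.py",
    "content": "# Self-contained sketch (assumed example file) of UtilsFunction2.splitdata.split_data:\n# two tiny CSVs are generated in a temporary directory and then split. The gene and\n# sample names and the label values are illustrative.\nimport os\nimport tempfile\nimport numpy as np\nimport pandas as pd\nfrom TransProPy.UtilsFunction2.splitdata import split_data\n\ntmp = tempfile.mkdtemp()\nsamples = [f's{i}' for i in range(20)]\ngenes = [f'g{i}' for i in range(30)]\nrng = np.random.default_rng(0)\n\n# genes x samples expression matrix\nexpr = pd.DataFrame(rng.random((30, 20)) * 100, index=genes, columns=samples)\nexpr.to_csv(os.path.join(tmp, 'gene_tpm.csv'))\n# one binary class label (1/2) per sample\nlabels = pd.DataFrame({'class': [1, 2] * 10}, index=samples)\nlabels.to_csv(os.path.join(tmp, 'tumor_class.csv'))\n\ntrain, test = split_data(\n    os.path.join(tmp, 'gene_tpm.csv'),\n    os.path.join(tmp, 'tumor_class.csv'),\n    class_name='class',\n    test_size=0.25,\n)\nprint(train.shape, test.shape)  # (15, 31) and (5, 31)\n"
  },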
  {
    "path": "TransProPy/UtilsFunction3/EnsembleForRFE.py",
    "content": "from sklearn.base import BaseEstimator\nfrom sklearn.svm import SVC\nfrom sklearn.tree import DecisionTreeClassifier\nfrom sklearn.ensemble import GradientBoostingClassifier\nimport numpy as np\n\n\nclass EnsembleForRFE(BaseEstimator):\n    \"\"\"\n    Ensemble estimator for recursive feature elimination.\n\n    Parameters:\n    - svm_C: float, regularization parameter for SVM.\n    - tree_max_depth: int, maximum depth of the decision tree.\n    - tree_min_samples_split: int, minimum number of samples required to split an internal node.\n    - gbm_learning_rate: float, learning rate for gradient boosting.\n    - gbm_n_estimators: int, number of boosting stages to be run for gradient boosting.\n    \"\"\"\n\n    def __init__(self, svm_C=1.0, tree_max_depth=None,\n                 tree_min_samples_split=2, gbm_learning_rate=0.1,\n                 gbm_n_estimators=100):\n        # Save passed parameters as class attributes\n        self.svm_C = svm_C\n        self.tree_max_depth = tree_max_depth\n        self.tree_min_samples_split = tree_min_samples_split\n        self.gbm_learning_rate = gbm_learning_rate\n        self.gbm_n_estimators = gbm_n_estimators\n\n        # Initialize individual models with the specified parameters\n        self.svm = SVC(kernel=\"linear\", probability=True, C=self.svm_C)\n        self.tree = DecisionTreeClassifier(max_depth=self.tree_max_depth,\n                                           min_samples_split=self.tree_min_samples_split)\n        self.gbm = GradientBoostingClassifier(learning_rate=self.gbm_learning_rate,\n                                              n_estimators=self.gbm_n_estimators)\n\n        self.feature_importances_ = None # Initialize feature importances attribute\n\n    def fit(self, X, y):\n        \"\"\"\n        Fit the individual models and compute aggregated feature importances.\n\n        Parameters:\n        - X: DataFrame, Feature dataset with shape (n_samples, n_features).\n        - y: ndarray, 1-D array of target values with shape (n_samples,).\n\n        Returns:\n        - self: object, Instance of the model.\n        \"\"\"\n        # Fit individual models\n        self.svm.fit(X, y)\n        self.tree.fit(X, y)\n        self.gbm.fit(X, y)\n\n        # Calculate feature importances and store as attributes\n        svm_importances = np.abs(self.svm.coef_[0])\n        tree_importances = self.tree.feature_importances_\n        gbm_importances = self.gbm.feature_importances_\n\n        # Average feature importances\n        self.feature_importances_ = (svm_importances + tree_importances + gbm_importances) / 3\n        return self\n\n    def predict(self, X):\n        \"\"\"\n        Predict class labels for samples in X using a soft voting mechanism.\n\n        Parameters:\n        - X: DataFrame, Input features.\n\n        Returns:\n        - Predicted class labels.\n        \"\"\"\n        # Get the probability predictions from individual models\n        probabilities = np.array([self.svm.predict_proba(X),\n                                  self.tree.predict_proba(X),\n                                  self.gbm.predict_proba(X)])\n\n        # Average probabilities for soft voting\n        avg_prob = np.mean(probabilities, axis=0)\n\n        # Predict class labels based on the highest probability\n        return np.argmax(avg_prob, axis=1)\n\n    def set_params(self, **params):\n        \"\"\"\n        Set parameters for the ensemble estimator. 
This will be used by hyperparameter\n        optimization methods like RandomizedSearchCV to update the parameters of the\n        individual models.\n\n        Parameters:\n        - **params: Keyword arguments for parameter names and values.\n        \"\"\"\n        # Update the parameter values based on provided keyword arguments\n        for key, value in params.items():\n            if key in ['svm_C', 'tree_max_depth',\n                       'tree_min_samples_split', 'gbm_learning_rate',\n                       'gbm_n_estimators']:\n                setattr(self, key, value)\n\n        # Re-initialize the models with the updated parameters\n        self.svm = SVC(kernel=\"linear\", probability=True, C=self.svm_C)\n        self.tree = DecisionTreeClassifier(max_depth=self.tree_max_depth,\n                                           min_samples_split=self.tree_min_samples_split)\n        self.gbm = GradientBoostingClassifier(learning_rate=self.gbm_learning_rate,\n                                              n_estimators=self.gbm_n_estimators)\n        return self\n"
  },
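  {
    "path": "examples/example_ensemble_for_rfe.py",
    "content": "# Illustrative usage sketch for EnsembleForRFE on synthetic data (hypothetical\n# example file, not part of the original package). It shows the averaged feature\n# importances that RFECV consumes during recursive elimination.\nfrom sklearn.datasets import make_classification\nfrom TransProPy.UtilsFunction3.EnsembleForRFE import EnsembleForRFE\n\nX, y = make_classification(n_samples=200, n_features=20, n_informative=5, random_state=0)\nest = EnsembleForRFE(svm_C=1.0, gbm_n_estimators=50)\nest.fit(X, y)\nprint(est.feature_importances_.round(3))  # mean of SVM, tree and GBM importances\nprint(est.predict(X[:5]))                 # soft-voting class labels\n"
  },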
  {
    "path": "TransProPy/UtilsFunction3/ExtractAndSaveResults.py",
    "content": "# TransProPy.UtilsFunction3.extract_and_save_results.py\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nfrom TransProPy.UtilsFunction3.PrintBoxedText import print_boxed_text\nfrom sklearn.metrics import roc_curve, roc_auc_score\nfrom sklearn.model_selection import cross_val_predict\nfrom sklearn.model_selection import StratifiedKFold\n\ndef extract_and_save_results(\n        clf,\n        X,\n        Y,\n        save_path,\n        n_cv,\n        show_plot=False,\n        use_tkagg=False):\n    \"\"\"\n    Extract and save various results from the trained model.\n\n    Parameters:\n    - clf: trained model (RandomizedSearchCV object).\n    - X: DataFrame, feature data used for training.\n    - save_path: str, base path for saving results.\n    - show_plot: bool, whether to display the plot.\n    - use_tkagg: bool, whether to use 'TkAgg' backend for matplotlib. Generally, choose True when using in PyCharm IDE, and choose False when rendering file.qmd to an HTML file.\n    \"\"\"\n\n    # Setting the matplotlib backend to 'TkAgg' if specified\n    if use_tkagg:\n        import matplotlib\n        matplotlib.use('TkAgg')\n\n    # Extracting cross-validation results\n    cv_results = clf.cv_results_\n    mean_test_scores = cv_results['mean_test_score'] # Calculate the average test score for each iteration\n    n_iterations = len(mean_test_scores)\n\n    # Plotting and saving the accuracy per iteration figure\n    plt.figure(figsize=(6, 4), facecolor='#f0f8fe')\n    plt.plot(range(1, n_iterations + 1), mean_test_scores, marker='o')\n    plt.title('Model Accuracy per Iteration')\n    plt.xlabel('Iteration')\n    plt.ylabel('Mean Test Accuracy')\n    plt.grid(True, color='#11479c', alpha=0.2)\n    # Get the current axes (ax), and set the background color of the plot area to white\n    ax = plt.gca()\n    ax.set_facecolor('#e1f0fb')\n    # Call tight_layout to automatically adjust the layout\n    plt.tight_layout()\n    plt.savefig(save_path + \"Model_Accuracy_per_Iteration_figure.pdf\", format='pdf')\n    # Optionally display the plot\n    if show_plot:\n        plt.show()\n\n    # plotting the ROC curve\n    # Predict probabilities\n    y_probas = cross_val_predict(clf.best_estimator_, X, Y, cv=StratifiedKFold(n_splits=n_cv), method='predict_proba')\n    # Take the probability of the positive class\n    y_scores = y_probas[:, 1]\n    # Calculate values for the ROC curve\n    fpr, tpr, thresholds = roc_curve(Y, y_scores)\n    # Calculate AUC value\n    roc_auc = roc_auc_score(Y, y_scores)\n\n    # Plot and save the ROC curve\n    plt.figure(figsize=(6, 4), facecolor='#f0f8fe')\n    plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)\n    plt.plot([0, 1], [0, 1], 'k--')  # Diagonal line for a random classifier\n    plt.xlabel('False Positive Rate')\n    plt.ylabel('True Positive Rate')\n    plt.title('ROC Curve')\n    plt.legend(loc='lower right')\n    # Get the current axes (ax), and set the background color of the plot area to white\n    ax = plt.gca()\n    ax.set_facecolor('#e1f0fb')\n    # Call tight_layout to automatically adjust the layout\n    plt.tight_layout()\n    plt.savefig(save_path + \"ROC_Curve_figure.pdf\", format='pdf')\n    # Optionally display the plot\n    if show_plot:\n        plt.show()\n\n\n    # Extracting feature selection results\n    feature_union = clf.best_estimator_.named_steps['feature_selection']\n    rfecv = feature_union.transformer_list[0][1]\n    selectkbest = feature_union.transformer_list[1][1]\n    
selected_features_rfecv = rfecv.support_\n    selected_features_selectkbest = selectkbest.get_support()\n    # Print selected features\n    print_boxed_text(\"Features selected by RFECV:\")\n    print(X.columns[selected_features_rfecv])\n    print_boxed_text(\"Features selected by SelectKBest:\")\n    print(X.columns[selected_features_selectkbest])\n\n    # Combining and saving selected features\n    combined_selected_features = np.logical_or(selected_features_rfecv, selected_features_selectkbest)\n    combined_features_df = pd.DataFrame({'Feature': X.columns[combined_selected_features]})\n    combined_features_df.to_csv(save_path + 'combined_features.csv', index=False)\n    print_boxed_text(f\"Total number of selected features: {combined_features_df.shape[0]}\")\n\n    # Extracting and saving EnsembleForRFE feature importances\n    ensemble_for_rfe = feature_union.transformer_list[0][1].estimator_\n    feature_importances_ensemble = ensemble_for_rfe.feature_importances_\n    importances_ensemble = zip(X.columns[selected_features_rfecv], feature_importances_ensemble)\n    sorted_importances_ensemble = sorted(importances_ensemble, key=lambda x: x[1], reverse=True)\n    df_importances_ensemble = pd.DataFrame(sorted_importances_ensemble, columns=['Feature', 'Importance'])\n    df_importances_ensemble.to_csv(save_path + 'ensemble_importances.csv', index=False)\n    print_boxed_text(\"Feature Importances from EnsembleForRFE:\")\n    print(df_importances_ensemble)\n\n    # Extracting and saving SelectKBest scores\n    selectkbest_scores = selectkbest.scores_[selected_features_selectkbest]\n    scores_selectkbest = zip(X.columns[selected_features_selectkbest], selectkbest_scores)\n    sorted_scores_selectkbest = sorted(scores_selectkbest, key=lambda x: x[1], reverse=True)\n    df_scores_selectkbest = pd.DataFrame(sorted_scores_selectkbest, columns=['Feature', 'Score'])\n    df_scores_selectkbest.to_csv(save_path + 'selectkbest_scores.csv', index=False)\n    print_boxed_text(\"Scores from SelectKBest:\")\n    print(df_scores_selectkbest)"
  },
  {
    "path": "TransProPy/UtilsFunction3/ExtractCommonSamples.py",
    "content": "def extract_common_samples(X, Y):\n    \"\"\"\n    Extracts common samples (rows) from two DataFrames based on their indices.\n\n    Parameters:\n    X (pd.DataFrame): First DataFrame.\n    Y (pd.DataFrame): Second DataFrame.\n\n    Returns:\n    pd.DataFrame, pd.DataFrame: Two DataFrames containing only the rows that are common in both.\n    \"\"\"\n    # Find common indices\n    common_indices = X.index.intersection(Y.index)\n\n    # Filter both DataFrames to keep only common rows\n    X_common = X.loc[common_indices]\n    Y_common = Y.loc[common_indices]\n\n    return X_common, Y_common"
  },
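  {
    "path": "examples/example_extract_common_samples.py",
    "content": "# Illustrative usage sketch for extract_common_samples (hypothetical example file,\n# not part of the original package).\nimport pandas as pd\nfrom TransProPy.UtilsFunction3.ExtractCommonSamples import extract_common_samples\n\nX = pd.DataFrame({'gene1': [1.0, 2.0, 3.0]}, index=['s1', 's2', 's3'])\nY = pd.DataFrame({'label': [0, 1]}, index=['s2', 's3'])\nX_common, Y_common = extract_common_samples(X, Y)\nprint(X_common.index.tolist())  # ['s2', 's3'] - only the shared samples remain\n"
  },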
  {
    "path": "TransProPy/UtilsFunction3/LoadAndPreprocessData.py",
    "content": "# TransProPy.UtilsFunction3.load_and_preprocess_data.py\n\nfrom TransProPy.UtilsFunction3.LoadFilterTranspose import load_filter_transpose\nfrom TransProPy.UtilsFunction3.LoadEncodeLabels import load_encode_labels\nfrom TransProPy.UtilsFunction3.ExtractCommonSamples import extract_common_samples\n\n\ndef load_and_preprocess_data(feature_file, label_file, label_column, threshold):\n    \"\"\"\n    Load and preprocess the data.\n\n    Parameters:\n    - feature_file: str, path to the feature data file.\n    - label_file: str, path to the label data file.\n    - label_column: str, column name of the labels in the label file.\n    - threshold: float, threshold for filtering in load_filter_transpose function.\n\n    Returns:\n    - X: DataFrame, preprocessed feature data.\n    - Y: ndarray, preprocessed label data.\n    \"\"\"\n    X = load_filter_transpose(threshold, feature_file)  # Load and filter features\n    Y = load_encode_labels(label_file, label_column)  # Load and encode labels\n    X, Y = extract_common_samples(X, Y)  # Extract common samples\n    Y = Y.values.ravel()  # Flatten Y to 1D array\n    return X, Y\n"
  },
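  {
    "path": "examples/example_load_and_preprocess_data.py",
    "content": "# Illustrative usage sketch for load_and_preprocess_data (hypothetical example file,\n# not part of the original package). File paths and the label column name are\n# assumptions; substitute your own data.\nfrom TransProPy.UtilsFunction3.LoadAndPreprocessData import load_and_preprocess_data\n\nX, Y = load_and_preprocess_data(\n    feature_file='../data/gene_tpm.csv',\n    label_file='../data/tumor_class.csv',\n    label_column='class',  # assumed column name\n    threshold=0.9,\n)\nprint(X.shape, Y.shape)  # X: samples x genes DataFrame, Y: flat integer label array\n"
  },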
  {
    "path": "TransProPy/UtilsFunction3/LoadEncodeLabels.py",
    "content": "from sklearn.preprocessing import LabelEncoder\nimport pandas as pd\n\ndef load_encode_labels(file_path, column_name):\n    \"\"\"\n    Reads a CSV file containing labels and encodes categorical labels in the specified column to numeric labels.\n\n    Parameters:\n    file_path (str): Path to the CSV file containing labels.\n    column_name (str): Name of the column to be encoded.\n\n    Returns:\n    Y (pd.DataFrame): A DataFrame containing the encoded numeric labels.\n    \"\"\"\n\n    # Load the data\n    y = pd.read_csv(file_path, index_col=0, header=0)\n\n    # Check if the specified column exists in the DataFrame\n    if column_name not in y.columns:\n        raise ValueError(f\"Column '{column_name}' not found in the DataFrame\")\n\n    # Create an instance of LabelEncoder\n    le = LabelEncoder()\n\n    # Apply LabelEncoder to the specified column\n    y_encoded = le.fit_transform(y[column_name]) # Many Scikit-learn models require Y to be numerical. Therefore, if Y is categorical, use the fit_transform method of LabelEncoder to convert the character labels of Y into integers.\n\n    # Convert the encoded labels back to a DataFrame\n    Y = pd.DataFrame(y_encoded, index=y.index, columns=[column_name])\n\n    return Y\n\n\n"
  },
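  {
    "path": "examples/example_load_encode_labels.py",
    "content": "# Illustrative usage sketch for load_encode_labels (hypothetical example file, not\n# part of the original package). A tiny CSV is written to a temporary location so\n# the sketch is self-contained.\nimport os\nimport tempfile\nimport pandas as pd\nfrom TransProPy.UtilsFunction3.LoadEncodeLabels import load_encode_labels\n\nlabels = pd.DataFrame({'class': ['tumor', 'normal', 'tumor']}, index=['s1', 's2', 's3'])\npath = os.path.join(tempfile.gettempdir(), 'labels_demo.csv')\nlabels.to_csv(path)\n\nY = load_encode_labels(path, 'class')\nprint(Y)  # 'normal' -> 0, 'tumor' -> 1 (LabelEncoder sorts classes alphabetically)\nos.remove(path)\n"
  },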
  {
    "path": "TransProPy/UtilsFunction3/LoadFilterTranspose.py",
    "content": "import pandas as pd\n\ndef load_filter_transpose(threshold, data_path='../data/gene_tpm.csv'):\n    \"\"\"\n    Remove samples with high zero expression.\n    -----------------------------------------\n    Parameters\n    data_path: string\n        For example: '../data/gene_tpm.csv'\n        Please note: The input data matrix should have genes as rows and samples as columns.\n    threshold: float\n        For example: 0.9\n        The set threshold indicates the proportion of non-zero value samples to all samples in each feature.\n    --------------------------------------------------------------------------------------------------------\n    Return\n        X: pandas.core.frame.DataFrame\n    -----------------------------------\n    \"\"\"\n    data = pd.read_csv(data_path, index_col=0, header=0)\n    # Calculate the count of non-zero values in each row.\n    non_zero_counts = data.astype(bool).sum(axis=1)\n    # Set a threshold indicating the proportion of gene expressions that are zeros.\n    # threshold = 0.9\n    # Filter rows based on the threshold.\n    X = data[non_zero_counts / data.shape[1] > threshold]\n    X = X.transpose()\n    # Return the result.\n    return X\n"
  },
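  {
    "path": "examples/example_load_filter_transpose.py",
    "content": "# Illustrative usage sketch for load_filter_transpose (hypothetical example file,\n# not part of the original package). A small gene-by-sample matrix is written to a\n# temporary CSV so the sketch is self-contained.\nimport os\nimport tempfile\nimport pandas as pd\nfrom TransProPy.UtilsFunction3.LoadFilterTranspose import load_filter_transpose\n\ngenes = pd.DataFrame(\n    {'s1': [5.0, 0.0, 2.0], 's2': [3.0, 0.0, 0.0], 's3': [8.0, 1.0, 4.0]},\n    index=['geneA', 'geneB', 'geneC'])\npath = os.path.join(tempfile.gettempdir(), 'gene_tpm_demo.csv')\ngenes.to_csv(path)\n\n# With threshold=0.5, geneB (only 1/3 non-zero) is dropped; geneA and geneC survive.\nX = load_filter_transpose(0.5, path)\nprint(X)  # samples are rows after the transpose\nos.remove(path)\n"
  },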
  {
    "path": "TransProPy/UtilsFunction3/LoggingCustomScorer.py",
    "content": "from sklearn.metrics import accuracy_score\nimport logging\nimport time\n\ndef logging_custom_scorer(n_iter=10, n_cv=5):\n    \"\"\"\n    Creates a custom scorer function for use in model evaluation processes.\n    This scorer logs both the accuracy score and the time taken for each call.\n\n    Parameters:\n    n_iter (int): Number of iterations for the search process. Default is 10.\n    n_cv (int): Number of cross-validation splits. Default is 5.\n\n    Returns:\n    function: A custom scorer function that logs the accuracy score and time taken for each call.\n    \"\"\"\n\n    # Initialize the time for the first call\n    last_time = time.time()\n\n    def custom_scorer(y_true, y_pred):\n        \"\"\"\n        Inner function to calculate the accuracy score, log it, and measure the time taken.\n\n        Parameters:\n        y_true (array-like): True labels.\n        y_pred (array-like): Predicted labels by the model.\n\n        Returns:\n        float: The accuracy score.\n        \"\"\"\n        nonlocal last_time  # Reference the last_time from the outer scope\n\n        # Record the current time and calculate the elapsed time since the last call\n        current_time = time.time()\n        elapsed = current_time - last_time\n        last_time = current_time  # Update last_time for the next call\n\n        # Calculate the accuracy score\n        score = accuracy_score(y_true, y_pred)\n\n        # Log the accuracy and the time taken for this scoring iteration\n        logging.info(f\"One scoring iteration completed, accuracy: {score}, time taken: {elapsed:.2f} seconds\")\n\n        return score\n\n    return custom_scorer\n\n"
  },
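  {
    "path": "examples/example_logging_custom_scorer.py",
    "content": "# Illustrative usage sketch for logging_custom_scorer (hypothetical example file,\n# not part of the original package). The returned function is wrapped with\n# make_scorer, mirroring how TrainModel uses it.\nimport logging\nfrom sklearn.datasets import make_classification\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.metrics import make_scorer\nfrom sklearn.model_selection import cross_val_score\nfrom TransProPy.UtilsFunction3.LoggingCustomScorer import logging_custom_scorer\n\nlogging.basicConfig(level=logging.INFO)\nX, y = make_classification(n_samples=100, n_features=10, random_state=0)\nscorer = make_scorer(logging_custom_scorer())\nscores = cross_val_score(LogisticRegression(max_iter=1000), X, y, cv=5, scoring=scorer)\nprint(scores)  # each fold also logs its accuracy and the time since the previous call\n"
  },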
  {
    "path": "TransProPy/UtilsFunction3/PrintBoxedText.py",
    "content": "def print_boxed_text(title):\n    \"\"\"\n    Prints a title in a boxed format.\n\n    This function creates a box around the given title text using hash (#) and\n    equals (=) symbols. It prints the title with a border on the top and bottom,\n    making it stand out in the console output.\n\n    Parameters:\n    - title: str, the text to be displayed inside the box.\n\n    Returns:\n    None. This function directly prints the formatted title to the console.\n    \"\"\"\n    # Create the top and bottom border line of the box.\n    # The border line consists of a hash symbol, followed by equals symbols\n    # the length of the title plus two (for padding), and then another hash symbol.\n    border_line = \"#\" + \"=\" * (len(title) + 2) + \"#\"\n\n    # Print the top border line.\n    print(\"\\n\" + border_line)\n\n    # Print the title, surrounded by hash symbols and padded with one space on each side.\n    print(f\"# {title} #\")\n\n    # Print the bottom border line.\n    print(border_line)\n"
  },
  {
    "path": "TransProPy/UtilsFunction3/SetupFeatureSelection.py",
    "content": "# TransProPy.UtilsFunction3.setup_feature_selection.py\n\nfrom sklearn.feature_selection import RFECV, SelectKBest, mutual_info_classif\nfrom sklearn.model_selection import StratifiedKFold\nfrom sklearn.pipeline import FeatureUnion\nfrom TransProPy.UtilsFunction3.EnsembleForRFE import EnsembleForRFE\n\n\ndef setup_feature_selection():\n    \"\"\"\n    Set up the feature selection process.\n\n    Returns:\n    - feature_selection: FeatureUnion, combined feature selection process.\n    \"\"\"\n    ensemble_estimator = EnsembleForRFE()\n    rfecv = RFECV(estimator=ensemble_estimator, cv=StratifiedKFold(5), scoring='accuracy')\n    selectkbest = SelectKBest(score_func=mutual_info_classif)\n    return FeatureUnion([(\"rfecv\", rfecv), (\"selectkbest\", selectkbest)])\n"
  },
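  {
    "path": "examples/example_setup_feature_selection.py",
    "content": "# Illustrative usage sketch for setup_feature_selection (hypothetical example file,\n# not part of the original package). Fitting RFECV with the ensemble estimator is\n# computationally heavy, so a very small synthetic dataset is used here.\nfrom sklearn.datasets import make_classification\nfrom TransProPy.UtilsFunction3.SetupFeatureSelection import setup_feature_selection\n\nX, y = make_classification(n_samples=60, n_features=8, n_informative=3, random_state=0)\nfeature_selection = setup_feature_selection()\nX_selected = feature_selection.fit_transform(X, y)\n# FeatureUnion concatenates the RFECV-selected and SelectKBest-selected columns,\n# so a feature chosen by both selectors appears twice.\nprint(X_selected.shape)\n"
  },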
  {
    "path": "TransProPy/UtilsFunction3/SetupLoggingAndProgressBar.py",
    "content": "import logging\nfrom tqdm import tqdm\n\ndef setup_logging_and_progress_bar(n_iter, n_cv):\n    \"\"\"\n    Set up logging and initialize a tqdm progress bar.\n\n    Parameters:\n    n_iter (int): Number of iterations for RandomizedSearchCV.\n    n_cv (int): Number of cross-validation folds.\n\n    Returns:\n    tqdm object: An initialized tqdm progress bar.\n    \"\"\"\n\n    # Configure basic logging - this time, without filename and filemode\n    logging.basicConfig(level=logging.INFO,\n                        format='%(asctime)s - %(levelname)s: %(message)s')\n\n    # Create a file handler for logging to a file\n    file_handler = logging.FileHandler('progress.log', mode='w')\n    file_handler.setLevel(logging.INFO)\n    file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s: %(message)s'))\n\n    # Create a stream handler for logging to the console\n    # stream_handler = logging.StreamHandler()\n    # stream_handler.setLevel(logging.INFO)\n    # stream_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s: %(message)s'))\n\n    # Get the default logger and add the two handlers to it\n    logger = logging.getLogger()\n    logger.addHandler(file_handler)\n    # logger.addHandler(stream_handler)\n\n    # Calculate total iterations\n    total_iterations = n_iter * n_cv\n\n    # Initialize and return tqdm progress bar\n    pbar = tqdm(total=total_iterations, desc='RandomizedSearchCV Progress')\n    return pbar\n"
  },
  {
    "path": "TransProPy/UtilsFunction3/TqdmCustomScorer.py",
    "content": "from sklearn.metrics import make_scorer, accuracy_score\nfrom tqdm import tqdm\n\ndef tqdm_custom_scorer(n_iter=10, n_cv=5):\n    \"\"\"\n    This function creates a custom scorer for use in model evaluation processes like\n    RandomizedSearchCV. It integrates a progress bar to track the evaluation process.\n\n    Parameters:\n    n_iter (int): Number of iterations for the search process. Default is 10.\n    n_cv (int): Number of cross-validation splits. Default is 5.\n\n    Returns:\n    function: A custom scorer function that can be used with model evaluation methods.\n    \"\"\"\n\n    # Initialize a tqdm progress bar with a total count based on the number of iterations and CV splits\n    pbar = tqdm(total=n_iter * n_cv, desc='RandomizedSearchCV progress')\n\n    # Define an inner function that will be used as the scorer\n    def custom_scorer(y_true, y_pred):\n        \"\"\"\n        Inner function to calculate the accuracy score and update the progress bar.\n\n        Parameters:\n        y_true (array-like): True labels.\n        y_pred (array-like): Predicted labels by the model.\n\n        Returns:\n        float: The accuracy score.\n        \"\"\"\n        # Calculate the accuracy score\n        score = accuracy_score(y_true, y_pred)\n        # Update the progress bar\n        pbar.update()\n        return score\n\n    # Return the custom scorer function\n    return custom_scorer\n\n\n\n\n"
  },
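  {
    "path": "examples/example_tqdm_custom_scorer.py",
    "content": "# Illustrative usage sketch for tqdm_custom_scorer (hypothetical example file, not\n# part of the original package). The returned function still needs make_scorer\n# before it can be passed to scikit-learn's scoring= argument.\nfrom sklearn.datasets import make_classification\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.metrics import make_scorer\nfrom sklearn.model_selection import cross_val_score\nfrom TransProPy.UtilsFunction3.TqdmCustomScorer import tqdm_custom_scorer\n\nX, y = make_classification(n_samples=100, n_features=10, random_state=0)\n# n_iter=1 here because cross_val_score makes a single pass over the n_cv folds\nscorer = make_scorer(tqdm_custom_scorer(n_iter=1, n_cv=5))\nscores = cross_val_score(LogisticRegression(max_iter=1000), X, y, cv=5, scoring=scorer)\nprint(scores)  # the progress bar advances once per scored fold\n"
  },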
  {
    "path": "TransProPy/UtilsFunction3/TrainModel.py",
    "content": "# TransProPy.UtilsFunction3.train_model.py\n\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.model_selection import StratifiedKFold, RandomizedSearchCV\nfrom sklearn.metrics import make_scorer\nfrom TransProPy.UtilsFunction3.LoggingCustomScorer import logging_custom_scorer\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.ensemble import StackingClassifier\nfrom sklearn.svm import SVC\nfrom sklearn.tree import DecisionTreeClassifier\nfrom sklearn.ensemble import GradientBoostingClassifier\nfrom sklearn.linear_model import LogisticRegression\n\ndef train_model(X, Y, feature_selection, parameters, n_iter, n_cv, n_jobs=9):\n    \"\"\"\n    Set up and run the model training process.\n\n    Parameters:\n    - X: DataFrame, feature data.\n    - Y: ndarray, label data.\n    - feature_selection: FeatureUnion, the feature selection process.\n    - parameters: dict, parameters for RandomizedSearchCV.\n    - n_iter: int, number of iterations for RandomizedSearchCV.\n    - n_cv: int, number of cross-validation folds.\n    - n_jobs: int, number of jobs to run in parallel (default is 9).\n\n    Returns:\n    - clf: RandomizedSearchCV object after fitting.\n    \"\"\"\n    feature_selection_pipeline = Pipeline([\n        ('scale', StandardScaler()),\n        ('feature_selection', feature_selection),\n        ('stacking', StackingClassifier(\n            estimators=[\n                ('svm', SVC(probability=True)),\n                ('dt', DecisionTreeClassifier()),\n                ('gbm', GradientBoostingClassifier())\n            ],\n            final_estimator=LogisticRegression()))\n    ])\n\n    clf = RandomizedSearchCV(\n        feature_selection_pipeline,\n        parameters,\n        cv=StratifiedKFold(n_splits=n_cv),\n        scoring=make_scorer(logging_custom_scorer(n_iter=n_iter, n_cv=n_cv)),\n        n_iter=n_iter,\n        random_state=0,\n        error_score='raise',\n        n_jobs=n_jobs  # Use the customizable n_jobs parameter\n    )\n    return clf\n"
  },
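  {
    "path": "examples/example_train_model.py",
    "content": "# Illustrative end-to-end sketch wiring the UtilsFunction3 helpers together\n# (hypothetical example file, not part of the original package). The file paths,\n# label column, output directory and parameter grid are all assumptions; the real\n# package may drive these helpers differently.\nimport os\nfrom scipy.stats import randint, uniform\nfrom TransProPy.UtilsFunction3.LoadAndPreprocessData import load_and_preprocess_data\nfrom TransProPy.UtilsFunction3.SetupFeatureSelection import setup_feature_selection\nfrom TransProPy.UtilsFunction3.TrainModel import train_model\nfrom TransProPy.UtilsFunction3.ExtractAndSaveResults import extract_and_save_results\n\nX, Y = load_and_preprocess_data('../data/gene_tpm.csv', '../data/tumor_class.csv',\n                                'class', threshold=0.9)  # assumed paths and column\nfeature_selection = setup_feature_selection()\n\n# Hypothetical search space over the RFECV ensemble's parameters, addressed through\n# the pipeline step names defined in train_model.\nparameters = {\n    'feature_selection__rfecv__estimator__svm_C': uniform(0.1, 10),\n    'feature_selection__rfecv__estimator__gbm_n_estimators': randint(50, 200),\n}\n\nclf = train_model(X, Y, feature_selection, parameters, n_iter=10, n_cv=5)\nclf.fit(X, Y)  # train_model only configures the search; fitting happens here\n\nos.makedirs('./results', exist_ok=True)\nextract_and_save_results(clf, X, Y, save_path='./results/', n_cv=5)\n"
  },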
  {
    "path": "TransProPy/UtilsFunction3/UpdateProgressBar.py",
    "content": "def update_progress_bar(pbar, log_file='progress.log'):\n    \"\"\"\n    Read the number of log entries in the log file and update the tqdm progress bar.\n\n    Parameters:\n    pbar (tqdm): The tqdm progress bar object.\n    log_file (str): Path to the log file, default is 'progress.log'.\n    \"\"\"\n\n    def count_logged_iterations():\n        \"\"\"Read and return the number of log entries in the log file.\"\"\"\n        with open(log_file, 'r') as file:\n            return sum(1 for _ in file)\n\n    # Read the log file and update the progress bar\n    logged_iterations = count_logged_iterations()\n    pbar.update(logged_iterations - pbar.n)  # Only increase by the number of new iterations logged\n"
  },
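  {
    "path": "examples/example_progress_bar.py",
    "content": "# Illustrative sketch pairing setup_logging_and_progress_bar with update_progress_bar\n# (hypothetical example file, not part of the original package). Each call of the\n# logging scorer appends a line to 'progress.log'; polling that file lets the bar\n# advance while a search is running elsewhere (e.g. in another process).\nimport time\nfrom TransProPy.UtilsFunction3.SetupLoggingAndProgressBar import setup_logging_and_progress_bar\nfrom TransProPy.UtilsFunction3.UpdateProgressBar import update_progress_bar\n\npbar = setup_logging_and_progress_bar(n_iter=10, n_cv=5)\n# ... start a RandomizedSearchCV fit elsewhere so it writes to progress.log ...\nfor _ in range(3):  # poll a few times as a demonstration\n    update_progress_bar(pbar, log_file='progress.log')\n    time.sleep(1)\npbar.close()\n"
  },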
  {
    "path": "TransProPy/UtilsFunction3/__init__.py",
    "content": ""
  },
  {
    "path": "TransProPy/__init__.py",
    "content": ""
  },
  {
    "path": "setup.py",
    "content": "from setuptools import setup, find_packages\n\nsetup(\n    name='transpropy',\n    version='1.0.0',\n    packages=find_packages(),\n    install_requires=[\n        \"numpy\",\n        \"pandas\",\n        \"setuptools\",\n        \"scikit-learn\",\n        \"tqdm\"\n    ],\n    url='https://github.com/SSSYDYSSS/TransProPy',\n    author='Yu Dongyue',\n    author_email='yudongyue@mail.nankai.edu.cn',\n    description='A collection of deep learning models that integrate algorithms and various machine learning approaches to extract features (genes) effective for classification and attribute them accordingly.'\n)"
  }
]