Repository: mle-infrastructure/mle-monitor Branch: main Commit: 8030875516d1 Files: 47 Total size: 317.2 KB Directory structure: gitextract_957adg6p/ ├── .codecov.yml ├── .gitignore ├── CHANGELOG.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── examples/ │ ├── base_config.json │ ├── getting_started.ipynb │ ├── protocol_gcs.py │ ├── protocol_local.py │ ├── run_infrastructure.py │ └── train.py ├── mle_monitor/ │ ├── __init__.py │ ├── _version.py │ ├── dashboard/ │ │ ├── __init__.py │ │ ├── components/ │ │ │ ├── __init__.py │ │ │ ├── plots.py │ │ │ ├── protocol.py │ │ │ └── usage.py │ │ ├── layout.py │ │ └── update.py │ ├── mle_dashboard.py │ ├── mle_protocol.py │ ├── mle_resource.py │ ├── protocol/ │ │ ├── __init__.py │ │ ├── add.py │ │ ├── gcs_sync.py │ │ ├── load.py │ │ ├── summary.py │ │ └── tables.py │ ├── resource/ │ │ ├── __init__.py │ │ ├── gcp.py │ │ ├── local.py │ │ ├── sge.py │ │ └── slurm.py │ └── utils/ │ ├── __init__.py │ ├── gcs_zip.py │ ├── helpers.py │ └── tracker.py ├── requirements/ │ ├── requirements-examples.txt │ ├── requirements-test.txt │ └── requirements.txt ├── setup.py └── tests/ ├── fixtures/ │ └── base_config.json ├── test_dashboard.py ├── test_protocol.py └── test_resource.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .codecov.yml ================================================ codecov: require_ci_to_pass: yes coverage: precision: 2 round: down range: "30...70" coverage: status: project: # settings affecting project coverage default: threshold: 5% # allow for 5% reduction of coverage without failing comment: layout: "reach, diff, files" behavior: default require_changes: true ignore: - "tests/.*" - "_version.py" - "setup.py" - "examples/*" - "docs/" ================================================ FILE: .gitignore ================================================ mle_protocol.db *.yaml retrieved_log_dir 
logs_search old.py *.zip .vim-arsync .DS_Store __pycache__ .sync-config.cson .ipynb_checkpoints *.egg-info data/ *.key tboards/ *.ckpt .pytest_cache/ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class experiment_dir # C extensions *.so examples/multi_seed_dir examples/multi_config_dir examples/every_k_dir examples/top_k_dir examples/post_plot_dir # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. *.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover .hypothesis/ .pytest_cache/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder target/ # Jupyter Notebook .ipynb_checkpoints # pyenv .python-version # celery beat schedule file celerybeat-schedule # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ ================================================ FILE: CHANGELOG.md ================================================ ## [v0.0.2] - [03/2022] ### Changed - Restructure requirements and installations. - Update notebook. 
## [v0.0.1] - [12/09/2021] ### Added Basic API for `MLEProtocol`, `MLEResource` & `MLEDashboard`: ```python from mle_monitor import MLEProtocol # Load protocol database or create new one -> print summary protocol_db = MLEProtocol("mle_protocol.db", verbose=False) protocol_db.summary(tail=10, verbose=True) # Draft data to store in protocol & add it to the protocol meta_data = { "purpose": "Grid search", # Purpose of experiment "project_name": "MNIST", # Project name of experiment "experiment_type": "hyperparameter-search", # Type of experiment "experiment_dir": "experiments/logs", # Experiment directory "num_total_jobs": 10, # Number of total jobs to run ... } new_experiment_id = protocol_db.add(meta_data) # ... train your 10 (pseudo) networks/complete respective jobs for i in range(10): protocol_db.update_progress_bar(new_experiment_id) # Wrap up an experiment (store completion time, etc.) protocol_db.complete(new_experiment_id) ``` ================================================ FILE: CONTRIBUTING.md ================================================ # Contributing to `mle-monitor` We love your input! We want to make contributing to this project as easy and transparent as possible, whether it's: - Reporting a bug - Discussing the current state of the code - Submitting a fix - Proposing new features - Becoming a maintainer ## We Develop with Github We use github to host code, to track issues and feature requests, as well as accept pull requests. ## We Use [Github Flow](https://guides.github.com/introduction/flow/index.html), So All Code Changes Happen Through Pull Requests Pull requests are the best way to propose changes to the codebase (we use [Github Flow](https://guides.github.com/introduction/flow/index.html)). We actively welcome your pull requests: 1. Fork the repo and create your branch from `master`. 2. If you've added code that should be tested, add tests. 3. If you've changed APIs, update the documentation. 4. Ensure the test suite passes. 5. 
Make sure your code lints. 6. Issue that pull request! ## Any contributions you make will be under the MIT Software License In short, when you submit code changes, your submissions are understood to be under the same [MIT License](http://choosealicense.com/licenses/mit/) that covers the project. Feel free to contact the maintainers if that's a concern. ## Report bugs using Github's [issues](https://github.com/mle-infrastructure/mle-monitor/issues) We use GitHub issues to track public bugs. Report a bug by [opening a new issue](https://github.com/mle-infrastructure/mle-monitor/issues/new); it's that easy! ## Write bug reports with detail, background, and sample code **Great Bug Reports** tend to have: - A quick summary and/or background - Steps to reproduce - Be specific! - Give sample code if you can. - What you expected would happen - What actually happens - Notes (possibly including why you think this might be happening, or stuff you tried that didn't work) ## Use the Black Coding Style The codebase follows the [Black](https://black.readthedocs.io/en/stable/the_black_code_style/current_style.html) coding style. Using an autoformatter can make your life easier! ## License By contributing, you agree that your contributions will be licensed under its MIT License. ## References This document was adapted from the open-source contribution guidelines for [Facebook's Draft](https://github.com/facebook/draft-js/blob/a9316a723f9e918afde44dea68b5f9f39b7d9b00/CONTRIBUTING.md) and from the [Transcriptase adapted version](https://gist.github.com/briandk/3d2e8b3ec8daf5a27a62). 
================================================ FILE: LICENSE ================================================ Copyright (c) 2018 The Python Packaging Authority Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
================================================ FILE: README.md ================================================ # Lightweight Experiment & Resource Monitoring 📺 [![Pyversions](https://img.shields.io/pypi/pyversions/mle-monitor.svg?style=flat-square)](https://pypi.python.org/pypi/mle-monitor) [![PyPI version](https://badge.fury.io/py/mle-monitor.svg)](https://badge.fury.io/py/mle-monitor) [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) [![codecov](https://codecov.io/gh/mle-infrastructure/mle-monitor/branch/main/graph/badge.svg?token=75FIYZG8BD)](https://codecov.io/gh/mle-infrastructure/mle-monitor) [![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/mle-infrastructure/mle-monitor/blob/main/examples/getting_started.ipynb) "Did I already run this experiment before? How many resources are currently available on my cluster?" If these are common questions you encounter during your daily life as a researcher, then `mle-monitor` is made for you. It provides a lightweight API for tracking your experiments using a pickle protocol database (e.g. for hyperparameter searches and/or multi-configuration/multi-seed runs). Furthermore, it comes with built-in resource monitoring on Slurm/Grid Engine clusters and local machines/servers. `mle-monitor` provides three core functionalities: - **`MLEProtocol`**: A composable protocol database API for ML experiments. - **`MLEResource`**: A tool for obtaining server/cluster usage statistics. - **`MLEDashboard`**: A dashboard visualizing resource usage & experiment protocol. To get started I recommend checking out the [colab notebook](https://colab.research.google.com/github/mle-infrastructure/mle-monitor/blob/main/examples/getting_started.ipynb) and an [example workflow](https://github.com/mle-infrastructure/mle-monitor/blob/main/examples/run_infrastructure.py). 
drawing ## `MLEProtocol`: Keeping Track of Your Experiments 📝 ```python from mle_monitor import MLEProtocol # Load protocol database or create new one -> print summary protocol_db = MLEProtocol("mle_protocol.db", verbose=False) protocol_db.summary(tail=10, verbose=True) # Draft data to store in protocol & add it to the protocol meta_data = { "purpose": "Grid search", # Purpose of experiment "project_name": "MNIST", # Project name of experiment "experiment_type": "hyperparameter-search", # Type of experiment "experiment_dir": "experiments/logs", # Experiment directory "num_total_jobs": 10, # Number of total jobs to run ... } new_experiment_id = protocol_db.add(meta_data) # ... train your 10 (pseudo) networks/complete respective jobs for i in range(10): protocol_db.update_progress_bar(new_experiment_id) # Wrap up an experiment (store completion time, etc.) protocol_db.complete(new_experiment_id) ``` The meta data can contain the following keys: | Search Type | Description | Default | |----------------------- | ----------- | --------------- | | `purpose` | Purpose of experiment | `'None provided'` | | `project_name` | Project name of experiment | `'default'` | | `exec_resource` | Resource jobs are run on | `'local'` | | `experiment_dir` | Experiment log storage directory | `'experiments'` | | `experiment_type` | Type of experiment to run | `'single'` | | `base_fname` | Main code script to execute | `'main.py'` | | `config_fname` | Config file path of experiment | `'base_config.yaml'` | | `num_seeds` | Number of evaluation seeds | 1 | | `num_total_jobs` | Number of total jobs to run | 1 | | `num_job_batches` | Number of sequential job batches | 1 | | `num_jobs_per_batch` | Number of jobs in single batch | 1 | | `time_per_job` | Expected duration: days-hours-minutes | `'00:01:00'` | | `num_cpus` | Number of CPUs used in job | 1 | | `num_gpus` | Number of GPUs used in job | 0 | Additionally you can synchronize the protocol with a Google Cloud Storage (GCS) bucket by
providing `cloud_settings`. In this case also the results stored in `experiment_dir` will be uploaded to the GCS bucket, when you call `protocol.complete()`. ```python # Define GCS settings - requires 'GOOGLE_APPLICATION_CREDENTIALS' env var. cloud_settings = { "project_name": "mle-toolbox", # GCP project name "bucket_name": "mle-protocol", # GCS bucket name "use_protocol_sync": True, # Whether to sync the protocol to GCS "use_results_storage": True, # Whether to sync experiment_dir to GCS } protocol_db = MLEProtocol("mle_protocol.db", cloud_settings, verbose=True) ``` ## The `MLEResource`: Keeping Track of Your Resources 📉 #### On Your Local Machine ```python from mle_monitor import MLEResource # Instantiate local resource and get usage data resource = MLEResource(resource_name="local") resource_data = resource.monitor() ``` #### On a Slurm Cluster ```python resource = MLEResource( resource_name="slurm-cluster", monitor_config={"partitions": ["", ""]}, ) ``` #### On a Grid Engine Cluster ```python resource = MLEResource( resource_name="sge-cluster", monitor_config={"queues": ["", ""]} ) ``` ## The `MLEDashboard`: Dashboard Visualization 🎞️ ```python from mle_monitor import MLEDashboard # Instantiate dashboard with protocol and resource dashboard = MLEDashboard(protocol, resource) # Get a static snapshot of the protocol & resource utilisation printed in console dashboard.snapshot() # Run monitoring in while loop - dashboard dashboard.live() ``` ## Installation ⏳ A PyPI installation is available via: ``` pip install mle-monitor ``` If you want to get the most recent commit, please install directly from the repository: ``` pip install git+https://github.com/mle-infrastructure/mle-monitor.git@main ``` ### Citing the MLE-Infrastructure ✏️ If you use `mle-monitor` in your research, please cite it as follows: ``` @software{mle_infrastructure2021github, author = {Robert Tjarko Lange}, title = {{MLE-Infrastructure}: A Set of Lightweight Tools for Distributed Machine 
Learning Experimentation}, url = {http://github.com/mle-infrastructure}, year = {2021}, } ``` ## Development 👷 You can run the test suite via `python -m pytest -vv tests/`. If you find a bug or are missing your favourite feature, feel free to create an issue and/or start [contributing](CONTRIBUTING.md) 🤗. ================================================ FILE: examples/base_config.json ================================================ { "train_config": {"lrate": 0.1}, "model_config": {"num_layers": 5}, "log_config": {"time_to_track": ["step_counter"], "what_to_track": ["loss"], "time_to_print": ["step_counter"], "what_to_print": ["loss"], "print_every_k_updates": 10, "overwrite_experiment_dir": 1} } ================================================ FILE: examples/getting_started.ipynb ================================================ { "cells": [ { "attachments": {}, "cell_type": "markdown", "id": "08f9d077-64ed-4dc3-b772-498569e96d2b", "metadata": { "tags": [] }, "source": [ "# `mle-monitor`: Lightweight Resource Monitoring\n", "### Author: [@RobertTLange](https://twitter.com/RobertTLange) [Last Update: March 2023][![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/mle-infrastructure/mle-monitor/blob/main/examples/getting_started.ipynb)\n", "\n", "\"Did I already run this experiment before? How many resources are currently available on my cluster?\" If these are common questions you encounter during your daily life as a researcher, then `mle-monitor` is made for you. It provides a lightweight API for tracking your experiments using a pickle protocol database (e.g. for hyperparameter searches and/or multi-configuration/multi-seed runs). Furthermore, it comes with built-in resource monitoring on Slurm/Grid Engine clusters and local machines/servers. 
Finally, it leverages [`rich`](https://github.com/willmcgugan/rich) in order to provide a terminal dashboard that is updated online with new protocolled experiments and the current state of resource utilization. Here is an example of a dashboard running on a Grid Engine cluster:\n", "\n", "\"drawing\"" ] }, { "cell_type": "code", "execution_count": 1, "id": "b1ff5d35-b29e-4386-b423-9c83ad1ae169", "metadata": {}, "outputs": [], "source": [ "%load_ext autoreload\n", "%autoreload 2\n", "%config InlineBackend.figure_format = 'retina'\n", "\n", "try:\n", " import mle_monitor\n", "except:\n", " !pip install -q mle-monitor\n", " import mle_monitor" ] }, { "cell_type": "markdown", "id": "9cf5bc30-a187-4936-a57c-3023ff6b3f2a", "metadata": {}, "source": [ "`mle-monitor` comes with three core functionalities:\n", "\n", "- **`MLEProtocol`**: A composable protocol database API for ML experiments.\n", "- **`MLEResource`**: A tool for obtaining server/cluster usage statistics.\n", "- **`MLEDashboard`**: A dashboard visualizing resource usage & experiment protocol.\n", "\n", "\"drawing\"\n", "\n", "Finally, `mle-monitor` is part of the [`mle-infrastructure`](https://github.com/mle-infrastructure) and comes with a set of handy built-in synergies. We will wrap-up by outlining a full workflow of using the protocol together with a random search experiment using [`mle-hyperopt`](https://github.com/mle-infrastructure/mle-hyperopt), [`mle-scheduler`](https://github.com/mle-infrastructure/mle-monitor) and [`mle-logging`](https://github.com/mle-infrastructure/mle-logging)." 
] }, { "cell_type": "code", "execution_count": 2, "id": "a66df14c-b920-4926-a8c1-37c42c63e7f2", "metadata": {}, "outputs": [], "source": [ "# Check if code is run in Colab: If so -- download configs from repo\n", "try:\n", " import google.colab\n", " IN_COLAB = True\n", " !wget -q https://raw.githubusercontent.com/mle-infrastructure/mle-monitor/main/examples/train.py\n", " !wget -q https://raw.githubusercontent.com/mle-infrastructure/mle-monitor/main/examples/base_config.json\n", "except:\n", " IN_COLAB = False" ] }, { "cell_type": "markdown", "id": "255b0522-6c4b-440d-8dca-345ba9e8a461", "metadata": {}, "source": [ "# Experiment Management with `MLEProtocol` 📝" ] }, { "cell_type": "code", "execution_count": 3, "id": "44a221b5-758b-491b-b206-65c3bddeeddb", "metadata": {}, "outputs": [], "source": [ "from mle_monitor import MLEProtocol\n", "\n", "# Load the protocol from a local file (create new if it doesn't exist yet)\n", "protocol = MLEProtocol(protocol_fname=\"mle_protocol.db\", verbose=True)" ] }, { "cell_type": "markdown", "id": "ef2d1797-0251-495f-8976-47a24756e79c", "metadata": {}, "source": [ "In order to add a new experiment to the protocol database you have to provide a dictionary containing the experiment meta data:\n", "\n", "| Search Type | Description | Default |\n", "|----------------------- | ----------- | --------------- |\n", "| `purpose` | Purpose of experiment | `'None provided'` |\n", "| `project_name` | Project name of experiment | `'default'` |\n", "| `exec_resource` | Resource jobs are run on | `'local'` |\n", "| `experiment_dir` | Experiment log storage directory | `'experiments'` |\n", "| `experiment_type` | Type of experiment to run | `'single'` |\n", "| `base_fname` | Main code script to execute | `'main.py'` |\n", "| `config_fname` | Config file path of experiment | `'base_config.yaml'` |\n", "| `num_seeds` | Number of evaluations seeds | 1 |\n", "| `num_total_jobs` | Number of total jobs to run | 1 |\n", "| `num_job_batches` | Number 
of jobs in single batch | 1 |\n", "| `num_jobs_per_batch` | Number of sequential job batches | 1 |\n", "| `time_per_job` | Expected duration: days-hours-minutes | `'00:01:00'` |\n", "| `num_cpus` | Number of CPUs used in job | 1 |\n", "| `num_gpus` | Number of GPUs used in job | 0 |" ] }, { "cell_type": "code", "execution_count": 4, "id": "d65bcced-fa6f-4865-939f-b8d35203b05f", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
[15:19:55] INFO     Added experiment 1 to protocol.                       mle_protocol.py:170\n",
       "
\n" ], "text/plain": [ "\u001b[2;36m[15:19:55]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Added experiment \u001b[1;36m1\u001b[0m to protocol. \u001b]8;id=600704;file:///Users/rob/anaconda3/envs/mle-toolbox/lib/python3.9/site-packages/mle_monitor/mle_protocol.py\u001b\\\u001b[2mmle_protocol.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=954284;file:///Users/rob/anaconda3/envs/mle-toolbox/lib/python3.9/site-packages/mle_monitor/mle_protocol.py#170\u001b\\\u001b[2m170\u001b[0m\u001b]8;;\u001b\\\n" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "meta_data = {\n", " \"purpose\": \"Test Protocol\", # Purpose of experiment\n", " \"project_name\": \"MNIST\", # Project name of experiment\n", " \"exec_resource\": \"local\", # Resource jobs are run on\n", " \"experiment_dir\": \"log_dir\", # Experiment log storage directory\n", " \"experiment_type\": \"hyperparameter-search\", # Type of experiment to run\n", " \"base_fname\": \"train.py\", # Main code script to execute\n", " \"config_fname\": \"base_config.json\", # Config file path of experiment\n", " \"num_seeds\": 5, # Number of evaluations seeds\n", " \"num_total_jobs\": 10, # Number of total jobs to run\n", " \"num_jobs_per_batch\": 5, # Number of jobs in single batch\n", " \"num_job_batches\": 2, # Number of sequential job batches\n", " \"time_per_job\": \"00:05:00\", # Expected duration: days-hours-minutes\n", " \"num_cpus\": 2, # Number of CPUs used in job\n", " \"num_gpus\": 1, # Number of GPUs used in job\n", " }\n", "e_id = protocol.add(meta_data, save=False)" ] }, { "cell_type": "markdown", "id": "be53c0dd-f83f-4935-8d06-926a2a93088c", "metadata": {}, "source": [ "Adding the experiment will load in the configuration file (either `.json` or `.yaml`) and set the experiment status to \"running\". 
You can then always retrieve the provided information using `protocol.get(e_id)`:" ] }, { "cell_type": "code", "execution_count": 5, "id": "76d04dd4-5e01-45c3-b5e6-86caf6521315", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'purpose': 'Test Protocol',\n", " 'project_name': 'MNIST',\n", " 'exec_resource': 'local',\n", " 'experiment_dir': 'log_dir',\n", " 'experiment_type': 'hyperparameter-search',\n", " 'base_fname': 'train.py',\n", " 'config_fname': 'base_config.json',\n", " 'num_seeds': 5,\n", " 'num_total_jobs': 10,\n", " 'num_jobs_per_batch': 5,\n", " 'num_job_batches': 2,\n", " 'time_per_job': '00:05:00',\n", " 'num_cpus': 2,\n", " 'num_gpus': 1,\n", " 'git_hash': 'ef86aa6343db21998ba58942e3c5e5124d3463c1',\n", " 'loaded_config': [{'train_config': {'lrate': 0.1},\n", " 'model_config': {'num_layers': 5},\n", " 'log_config': {'time_to_track': ['step_counter'],\n", " 'what_to_track': ['loss'],\n", " 'time_to_print': ['step_counter'],\n", " 'what_to_print': ['loss'],\n", " 'print_every_k_updates': 10,\n", " 'overwrite_experiment_dir': 1}}],\n", " 'e-hash': '1795b93d0ded1d5ab9c5a63243abe649',\n", " 'retrieved_results': False,\n", " 'stored_in_cloud': False,\n", " 'report_generated': False,\n", " 'job_status': 'running',\n", " 'completed_jobs': 0,\n", " 'start_time': '03/08/23 15:19',\n", " 'duration': '0:10:00',\n", " 'stop_time': '03/09/23 01:19'}" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "protocol.get(e_id)" ] }, { "cell_type": "markdown", "id": "683161f4-aa0e-467e-89fa-714f739815e4", "metadata": {}, "source": [ "You can also always print a summary snapshot of the last experiments using `protocol.summary()`. By providing the boolean option `full`, you also print the resources used in an experiment:" ] }, { "cell_type": "code", "execution_count": 6, "id": "e13d7d30-8d6b-49d0-b3c0-427654a19383", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
                                                                                             \n",
       "  🔖   🆔     🗓     Project   Purpose         Type               CPU   GPU                 \n",
       " ────────────────────────────────────────────────────────────────────────────                \n",
       "     1    03/08   MNIST     Test Protocol   search   Local   5    2     1                  \n",
       "                                                                                             \n",
       "
\n" ], "text/plain": [ "\u001b[34m \u001b[0m \n", "\u001b[34m \u001b[0m\u001b[1;34m \u001b[0m\u001b[1;34m🔖\u001b[0m\u001b[1;34m \u001b[0m\u001b[34m \u001b[0m\u001b[1;34m \u001b[0m\u001b[1;34m🆔\u001b[0m\u001b[1;34m \u001b[0m\u001b[34m \u001b[0m\u001b[1;34m \u001b[0m\u001b[1;34m 🗓 \u001b[0m\u001b[1;34m \u001b[0m\u001b[34m \u001b[0m\u001b[1;34m \u001b[0m\u001b[1;34mProject\u001b[0m\u001b[1;34m \u001b[0m\u001b[34m \u001b[0m\u001b[1;34m \u001b[0m\u001b[1;34mPurpose \u001b[0m\u001b[1;34m \u001b[0m\u001b[34m \u001b[0m\u001b[1;34m \u001b[0m\u001b[1;34mType \u001b[0m\u001b[1;34m \u001b[0m\u001b[34m \u001b[0m\u001b[1;34m \u001b[0m\u001b[1;34m \u001b[0m\u001b[1;33m▶\u001b[0m\u001b[1;34m \u001b[0m\u001b[1;34m \u001b[0m\u001b[34m \u001b[0m\u001b[1;34m \u001b[0m\u001b[1;33m♻\u001b[0m\u001b[1;34m \u001b[0m\u001b[34m \u001b[0m\u001b[1;34m \u001b[0m\u001b[1;34mCPU\u001b[0m\u001b[1;34m \u001b[0m\u001b[34m \u001b[0m\u001b[1;34m \u001b[0m\u001b[1;34mGPU\u001b[0m\u001b[1;34m \u001b[0m\u001b[34m \u001b[0m \n", "\u001b[34m ──────────────────────────────────────────────────────────────────────────── \u001b[0m \n", "\u001b[34m \u001b[0m \u001b[35m⠙ \u001b[0m \u001b[34m \u001b[0m 1 \u001b[34m \u001b[0m 03/08 \u001b[34m \u001b[0m MNIST \u001b[34m \u001b[0m Test Protocol \u001b[34m \u001b[0m search \u001b[34m \u001b[0m Local \u001b[34m \u001b[0m 5 \u001b[34m \u001b[0m 2 \u001b[34m \u001b[0m 1 \u001b[34m \u001b[0m \n", "\u001b[34m \u001b[0m \n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
                                                                                             \n",
       "                                                                               ⏳ Completed  \n",
       "  🔖   🆔     🗓     Project   Purpose         Type               CPU   GPU      Jobs      \n",
       " ─────────────────────────────────────────────────────────────────────────────────────────── \n",
       "     1    03/08   MNIST     Test Protocol   search   Local   5    2     1     0 /10    0%  \n",
       "                                                                                             \n",
       "
\n" ], "text/plain": [ "\u001b[34m \u001b[0m\n", "\u001b[34m \u001b[0m\u001b[1;34m \u001b[0m\u001b[34m \u001b[0m\u001b[1;34m \u001b[0m\u001b[34m \u001b[0m\u001b[1;34m \u001b[0m\u001b[34m \u001b[0m\u001b[1;34m \u001b[0m\u001b[34m \u001b[0m\u001b[1;34m \u001b[0m\u001b[34m \u001b[0m\u001b[1;34m \u001b[0m\u001b[34m \u001b[0m\u001b[1;34m \u001b[0m\u001b[34m \u001b[0m\u001b[1;34m \u001b[0m\u001b[34m \u001b[0m\u001b[1;34m \u001b[0m\u001b[34m \u001b[0m\u001b[1;34m \u001b[0m\u001b[34m \u001b[0m\u001b[1;34m \u001b[0m\u001b[1;34m⏳ Completed\u001b[0m\u001b[1;34m \u001b[0m\u001b[34m \u001b[0m\n", "\u001b[34m \u001b[0m\u001b[1;34m \u001b[0m\u001b[1;34m🔖\u001b[0m\u001b[1;34m \u001b[0m\u001b[34m \u001b[0m\u001b[1;34m \u001b[0m\u001b[1;34m🆔\u001b[0m\u001b[1;34m \u001b[0m\u001b[34m \u001b[0m\u001b[1;34m \u001b[0m\u001b[1;34m 🗓 \u001b[0m\u001b[1;34m \u001b[0m\u001b[34m \u001b[0m\u001b[1;34m \u001b[0m\u001b[1;34mProject\u001b[0m\u001b[1;34m \u001b[0m\u001b[34m \u001b[0m\u001b[1;34m \u001b[0m\u001b[1;34mPurpose \u001b[0m\u001b[1;34m \u001b[0m\u001b[34m \u001b[0m\u001b[1;34m \u001b[0m\u001b[1;34mType \u001b[0m\u001b[1;34m \u001b[0m\u001b[34m \u001b[0m\u001b[1;34m \u001b[0m\u001b[1;34m \u001b[0m\u001b[1;33m▶\u001b[0m\u001b[1;34m \u001b[0m\u001b[1;34m \u001b[0m\u001b[34m \u001b[0m\u001b[1;34m \u001b[0m\u001b[1;33m♻\u001b[0m\u001b[1;34m \u001b[0m\u001b[34m \u001b[0m\u001b[1;34m \u001b[0m\u001b[1;34mCPU\u001b[0m\u001b[1;34m \u001b[0m\u001b[34m \u001b[0m\u001b[1;34m \u001b[0m\u001b[1;34mGPU\u001b[0m\u001b[1;34m \u001b[0m\u001b[34m \u001b[0m\u001b[1;34m \u001b[0m\u001b[1;34m Jobs \u001b[0m\u001b[1;33m✔\u001b[0m\u001b[1;34m \u001b[0m\u001b[1;34m \u001b[0m\u001b[34m \u001b[0m\n", "\u001b[34m ─────────────────────────────────────────────────────────────────────────────────────────── \u001b[0m\n", "\u001b[34m \u001b[0m \u001b[35m⠙ \u001b[0m \u001b[34m \u001b[0m 1 \u001b[34m \u001b[0m 03/08 \u001b[34m \u001b[0m MNIST \u001b[34m \u001b[0m Test Protocol \u001b[34m \u001b[0m search \u001b[34m 
\u001b[0m Local \u001b[34m \u001b[0m 5 \u001b[34m \u001b[0m 2 \u001b[34m \u001b[0m 1 \u001b[34m \u001b[0m \u001b[37m 0 /10 \u001b[0m \u001b[35m 0%\u001b[0m \u001b[34m \u001b[0m\n", "\u001b[34m \u001b[0m\n" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Print a summary of the last experiments\n", "sub_df = protocol.summary()\n", "\n", "# ... and a more detailed version\n", "sub_df = protocol.summary(full=True)" ] }, { "cell_type": "markdown", "id": "0d2c9fe6-c040-4eaf-b477-bad3df93e4e1", "metadata": {}, "source": [ "If you want to ad-hoc change any of the stored attributes of an experiment, you can do so using the `update` method. Furthermore, you can change the experiment status using `abort` or `complete`:" ] }, { "cell_type": "code", "execution_count": 7, "id": "a6ab5f12-0c06-494f-ab77-f7ea37262d0d", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
                                                                                             \n",
       "  🔖   🆔     🗓     Project   Purpose         Type               CPU   GPU                 \n",
       " ────────────────────────────────────────────────────────────────────────────                \n",
       "      1    03/08   MNIST     Test Protocol   search   Slurm   5    2     1                  \n",
       "                                                                                             \n",
       "
\n" ], "text/plain": [ "\u001b[34m \u001b[0m \n", "\u001b[34m \u001b[0m\u001b[1;34m \u001b[0m\u001b[1;34m🔖\u001b[0m\u001b[1;34m \u001b[0m\u001b[34m \u001b[0m\u001b[1;34m \u001b[0m\u001b[1;34m🆔\u001b[0m\u001b[1;34m \u001b[0m\u001b[34m \u001b[0m\u001b[1;34m \u001b[0m\u001b[1;34m 🗓 \u001b[0m\u001b[1;34m \u001b[0m\u001b[34m \u001b[0m\u001b[1;34m \u001b[0m\u001b[1;34mProject\u001b[0m\u001b[1;34m \u001b[0m\u001b[34m \u001b[0m\u001b[1;34m \u001b[0m\u001b[1;34mPurpose \u001b[0m\u001b[1;34m \u001b[0m\u001b[34m \u001b[0m\u001b[1;34m \u001b[0m\u001b[1;34mType \u001b[0m\u001b[1;34m \u001b[0m\u001b[34m \u001b[0m\u001b[1;34m \u001b[0m\u001b[1;34m \u001b[0m\u001b[1;33m▶\u001b[0m\u001b[1;34m \u001b[0m\u001b[1;34m \u001b[0m\u001b[34m \u001b[0m\u001b[1;34m \u001b[0m\u001b[1;33m♻\u001b[0m\u001b[1;34m \u001b[0m\u001b[34m \u001b[0m\u001b[1;34m \u001b[0m\u001b[1;34mCPU\u001b[0m\u001b[1;34m \u001b[0m\u001b[34m \u001b[0m\u001b[1;34m \u001b[0m\u001b[1;34mGPU\u001b[0m\u001b[1;34m \u001b[0m\u001b[34m \u001b[0m \n", "\u001b[34m ──────────────────────────────────────────────────────────────────────────── \u001b[0m \n", "\u001b[34m \u001b[0m \u001b[31m✖\u001b[0m \u001b[34m \u001b[0m 1 \u001b[34m \u001b[0m 03/08 \u001b[34m \u001b[0m MNIST \u001b[34m \u001b[0m Test Protocol \u001b[34m \u001b[0m search \u001b[34m \u001b[0m Slurm \u001b[34m \u001b[0m 5 \u001b[34m \u001b[0m 2 \u001b[34m \u001b[0m 1 \u001b[34m \u001b[0m \n", "\u001b[34m \u001b[0m \n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'aborted'" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Update some element in the database\n", "protocol.update(e_id, \"exec_resource\", \"slurm-cluster\", save=False)\n", "\n", "# Abort the experiment - changes status\n", "protocol.abort(e_id, save=False)\n", "sub_df = protocol.summary()\n", "\n", "# Get the status of the experiment\n", "protocol.status(e_id)" ] }, { "cell_type": "markdown", "id": 
"03d2577e-d4d0-484c-8a9a-832e44a6e10e", "metadata": {}, "source": [ "If you would like to get a summary of all reported experiments, the last experiment and its resource requirements, `protocol.monitor()` does so:" ] }, { "cell_type": "code", "execution_count": 8, "id": "25cb30e1-a36f-4307-9c6a-65a3f5c864d2", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'total': '1',\n", " 'run': '0',\n", " 'done': '0',\n", " 'aborted': '1',\n", " 'sge': '0',\n", " 'slurm': '1',\n", " 'gcp': '0',\n", " 'local': '0',\n", " 'report_gen': '0',\n", " 'gcs_stored': '0',\n", " 'retrieved': '0'}" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Get the monitoring data - used later in dashboard\n", "protocol_data = protocol.monitor()\n", "protocol_data[\"total_data\"]" ] }, { "cell_type": "markdown", "id": "b2338033-85fa-4cbd-9d09-e60395a5aef4", "metadata": {}, "source": [ "Finally, you can also store other data specific to an experiment using an additional dictionary of data as follows:" ] }, { "cell_type": "code", "execution_count": 9, "id": "046554ec-c8dd-4660-a283-b8ef464d387c", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
[15:20:05] INFO     Added experiment 2 to protocol.                       mle_protocol.py:170\n",
       "
\n" ], "text/plain": [ "\u001b[2;36m[15:20:05]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Added experiment \u001b[1;36m2\u001b[0m to protocol. \u001b]8;id=818970;file:///Users/rob/anaconda3/envs/mle-toolbox/lib/python3.9/site-packages/mle_monitor/mle_protocol.py\u001b\\\u001b[2mmle_protocol.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=795665;file:///Users/rob/anaconda3/envs/mle-toolbox/lib/python3.9/site-packages/mle_monitor/mle_protocol.py#170\u001b\\\u001b[2m170\u001b[0m\u001b]8;;\u001b\\\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "{'lrate': 0.0003}" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "extra_data = {\"extra_config\": {\"lrate\": 3e-04}}\n", "e_id = protocol.add(meta_data, extra_data, save=False)\n", "protocol.get(e_id)[\"extra_config\"]" ] }, { "cell_type": "markdown", "id": "9ffe6828-8077-4219-9cea-9f81fb0ce10b", "metadata": {}, "source": [ "## Syncing your Protocol DB with a GCS Bucket" ] }, { "cell_type": "markdown", "id": "7384b977-b9b3-4482-afec-dccb899ff598", "metadata": {}, "source": [ "If you would like to keep a remote copy of your protocol, you can also automatically sync your protocol database with a Google Cloud Storage (GCS) bucket. This is especially useful when running experiments on multiple resource and will require you to have created a GCP project and a GCS bucket. Furthermore you will have to provide you `.json` authentication key path. If you don't have one yet, have a look [here](https://cloud.google.com/docs/authentication/getting-started). Alternatively, just make sure that the environment variable `GOOGLE_APPLICATION_CREDENTIALS` is set to the right path." ] }, { "cell_type": "code", "execution_count": 10, "id": "63324b07-e05f-4ed6-9039-f57e26fbb124", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
[15:38:01] INFO     No DB found in GCloud Storage - mle_protocol.db            gcs_sync.py:39\n",
       "
\n" ], "text/plain": [ "\u001b[2;36m[15:38:01]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m No DB found in GCloud Storage - mle_protocol.db \u001b]8;id=165072;file:///Users/rob/Dropbox/core-code/mle-infrastructure/mle-monitor/mle_monitor/protocol/gcs_sync.py\u001b\\\u001b[2mgcs_sync.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=723900;file:///Users/rob/Dropbox/core-code/mle-infrastructure/mle-monitor/mle_monitor/protocol/gcs_sync.py#39\u001b\\\u001b[2m39\u001b[0m\u001b]8;;\u001b\\\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
           INFO     New DB will be created - mle-toolbox/mle-protocol          gcs_sync.py:40\n",
       "
\n" ], "text/plain": [ "\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m New DB will be created - mle-toolbox/mle-protocol \u001b]8;id=436921;file:///Users/rob/Dropbox/core-code/mle-infrastructure/mle-monitor/mle_monitor/protocol/gcs_sync.py\u001b\\\u001b[2mgcs_sync.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=297090;file:///Users/rob/Dropbox/core-code/mle-infrastructure/mle-monitor/mle_monitor/protocol/gcs_sync.py#40\u001b\\\u001b[2m40\u001b[0m\u001b]8;;\u001b\\\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
           INFO     Pulled protocol from GCS bucket: mle-protocol.        mle_protocol.py:379\n",
       "
\n" ], "text/plain": [ "\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Pulled protocol from GCS bucket: mle-protocol. \u001b]8;id=793270;file:///Users/rob/Dropbox/core-code/mle-infrastructure/mle-monitor/mle_monitor/mle_protocol.py\u001b\\\u001b[2mmle_protocol.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=283709;file:///Users/rob/Dropbox/core-code/mle-infrastructure/mle-monitor/mle_monitor/mle_protocol.py#379\u001b\\\u001b[2m379\u001b[0m\u001b]8;;\u001b\\\n" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Sync your protocol with a GCS bucket\n", "cloud_settings = {\n", " \"project_name\": \"mle-toolbox\", # Name of your GCP project\n", " \"bucket_name\": \"mle-protocol\", # Name of your GCS bucket\n", " \"protocol_fname\": \"mle_protocol.db\", # Name of DB file in GCS bucket\n", " \"use_protocol_sync\": True, # Whether to sync the protocol\n", " \"use_results_storage\": False # Whether to upload zipped dir at completion\n", "}\n", "protocol = MLEProtocol(protocol_fname=\"mle_protocol.db\",\n", " cloud_settings=cloud_settings,\n", " verbose=True)" ] }, { "cell_type": "code", "execution_count": 11, "id": "14065522-5cda-485a-95d9-be9109c92004", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
[15:38:06] INFO     Added experiment 1 to protocol.                       mle_protocol.py:162\n",
       "
\n" ], "text/plain": [ "\u001b[2;36m[15:38:06]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Added experiment \u001b[1;36m1\u001b[0m to protocol. \u001b]8;id=53082;file:///Users/rob/Dropbox/core-code/mle-infrastructure/mle-monitor/mle_monitor/mle_protocol.py\u001b\\\u001b[2mmle_protocol.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=761624;file:///Users/rob/Dropbox/core-code/mle-infrastructure/mle-monitor/mle_monitor/mle_protocol.py#162\u001b\\\u001b[2m162\u001b[0m\u001b]8;;\u001b\\\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
           INFO     Locally stored protocol: mle_protocol.db               mle_protocol.py:91\n",
       "
\n" ], "text/plain": [ "\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Locally stored protocol: mle_protocol.db \u001b]8;id=284160;file:///Users/rob/Dropbox/core-code/mle-infrastructure/mle-monitor/mle_monitor/mle_protocol.py\u001b\\\u001b[2mmle_protocol.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=467582;file:///Users/rob/Dropbox/core-code/mle-infrastructure/mle-monitor/mle_monitor/mle_protocol.py#91\u001b\\\u001b[2m91\u001b[0m\u001b]8;;\u001b\\\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
[15:38:07] INFO     Send to GCloud Storage - mle_protocol.db                   gcs_sync.py:70\n",
       "
\n" ], "text/plain": [ "\u001b[2;36m[15:38:07]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Send to GCloud Storage - mle_protocol.db \u001b]8;id=904611;file:///Users/rob/Dropbox/core-code/mle-infrastructure/mle-monitor/mle_monitor/protocol/gcs_sync.py\u001b\\\u001b[2mgcs_sync.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=895801;file:///Users/rob/Dropbox/core-code/mle-infrastructure/mle-monitor/mle_monitor/protocol/gcs_sync.py#70\u001b\\\u001b[2m70\u001b[0m\u001b]8;;\u001b\\\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
           INFO     Send protocol to GCS bucket: mle-protocol.            mle_protocol.py:364\n",
       "
\n" ], "text/plain": [ "\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Send protocol to GCS bucket: mle-protocol. \u001b]8;id=688284;file:///Users/rob/Dropbox/core-code/mle-infrastructure/mle-monitor/mle_monitor/mle_protocol.py\u001b\\\u001b[2mmle_protocol.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=353419;file:///Users/rob/Dropbox/core-code/mle-infrastructure/mle-monitor/mle_monitor/mle_protocol.py#364\u001b\\\u001b[2m364\u001b[0m\u001b]8;;\u001b\\\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
           INFO     GCS synced protocol: mle_protocol.db                   mle_protocol.py:97\n",
       "
\n" ], "text/plain": [ "\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m GCS synced protocol: mle_protocol.db \u001b]8;id=494795;file:///Users/rob/Dropbox/core-code/mle-infrastructure/mle-monitor/mle_monitor/mle_protocol.py\u001b\\\u001b[2mmle_protocol.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=925877;file:///Users/rob/Dropbox/core-code/mle-infrastructure/mle-monitor/mle_monitor/mle_protocol.py#97\u001b\\\u001b[2m97\u001b[0m\u001b]8;;\u001b\\\n" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "e_id = protocol.add(meta_data)" ] }, { "cell_type": "markdown", "id": "8aa22e7c-e46f-4be8-8e0e-aeb254ad4932", "metadata": {}, "source": [ "Finally, you can also choose to store the results of an experiment in the GCS bucket. In this case the protocol will upload a zipped version of your created `experiment_dir` to the bucket whenever you call `protocol.complete()`." ] }, { "cell_type": "markdown", "id": "c74100d5-a957-4a7e-ae87-4a4626babdc4", "metadata": {}, "source": [ "# Resource Monitoring with `MLEResource` 📉\n", "\n", "You can monitor your local machine, server or clusters using the `MLEResource`. If you are running this on a Google Colab, make sure to add a GPU accelerator!" 
] }, { "cell_type": "code", "execution_count": 10, "id": "fed9e8a7-74fd-4725-9bd6-31106018069d", "metadata": {}, "outputs": [], "source": [ "from mle_monitor import MLEResource\n", "\n", "resource = MLEResource(resource_name=\"local\")\n", "resource_data = resource.monitor()" ] }, { "cell_type": "code", "execution_count": 11, "id": "0c53b8d0-0093-4de4-868b-40f0b0652322", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "dict_keys(['pid', 'p_name', 'mem_util', 'cpu_util', 'cmdline', 'total_cpu_util', 'total_mem_util'])" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "resource_data[\"user_data\"].keys()" ] }, { "cell_type": "markdown", "id": "260c0107-21e1-4c5b-8a84-c3cd96a91536", "metadata": {}, "source": [ "You can also monitor slurm or grid engine clusters by providing the queues/partitions to monitor in `monitor_config`:\n", "```python\n", "resource = MLEResource(\n", " resource_name=\"slurm-cluster\",\n", " monitor_config={\"partitions\": [\"\", \"\"]},\n", ")\n", "resource = MLEResource(\n", " resource_name=\"sge-cluster\",\n", " monitor_config={\"queues\": [\"\", \"\"]}\n", ")\n", "```" ] }, { "cell_type": "markdown", "id": "70353a53-f0cb-4eae-8c18-93b7ef07eb14", "metadata": {}, "source": [ "# Dashboard Visualization with `MLEDashboard` 🎞️" ] }, { "cell_type": "code", "execution_count": 14, "id": "6daefdf4-0fab-4031-bc53-fa56bb8d6bb7", "metadata": {}, "outputs": [], "source": [ "from mle_monitor import MLEDashboard\n", "\n", "dashboard = MLEDashboard(protocol, resource)" ] }, { "cell_type": "code", "execution_count": 15, "id": "c65b012e-d8ef-4ef9-b8a3-8ac5098b06a5", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
╭───────────────────────────────────────────────────────────────────────────────────────────╮\n",
       "General Settings 💡                               __           Thu Dec 9 15:38:18 2021 ⏰\n",
       "│                           _ __                                                            │\n",
       "│ • GCS Sync Protocol:                  ____ ___  / /__           Author: @RobertTLange 🐦 │\n",
       "│                           ____ ___  ____  ____  (_) /_____                                │\n",
       "│                           _____                                                           │\n",
       "╰───────────────────────────────────────────────────────────────────────────────────────────╯\n",
       "╭─ Local - Util ─╮╭──────────── Experiment Protocol Summary ─────────────╮╭─ Total Number o─╮\n",
       "                                                                                       \n",
       "                   🔖   🆔   🗓    P…   P…            G…                   \n",
       "                                                                      1      0      \n",
       "    ────────                                                                      \n",
       "                                                                        0      0      \n",
       "                  ──────────────────────────────────────────────────     -            \n",
       "                      1    1…   M…   T…    L…   5   2   1           -      0      \n",
       "                                       P…                                              \n",
       "    ────────                                                                           \n",
       "                                                                                       \n",
       "                                                                                       \n",
       "                                                                                       \n",
       "                                                                                       \n",
       "    ────────                                                                           \n",
       "                                                                                       \n",
       "                                                                                       \n",
       "                                                                                       \n",
       "                                                                                    \n",
       "  ────────────                                                        ╰─────────────────╯\n",
       "                                                                   ╭─ Last Experimen─╮\n",
       "                                                                                    \n",
       "                                                                      E-ID   1      \n",
       "                                                                      Type   hyp…   \n",
       "                                                                      Dir.   log…   \n",
       "                                                                      Scr…   tra…   \n",
       "                                                                      Con…   bas…   \n",
       "                                                                      Sta…   Run…   \n",
       "                                                                             🏃     \n",
       "                                                                      Res…   loc…   \n",
       "                                                                                    \n",
       "                                                                                    \n",
       "  ────────────                                                                         \n",
       "                                                                                    \n",
       "                                                                                       \n",
       "                                                                                       \n",
       "                                                                                       \n",
       "                                                                                       \n",
       "                                                                                       \n",
       "                                                                                       \n",
       "                                                                      ╰─────────────────╯\n",
       "                                                                      ╭─ Experiment Com─╮\n",
       "                                                                                       \n",
       "                                                                         Con…   2/5    \n",
       "                                                                         Tot…   10     \n",
       "                                                                         Jobs          \n",
       "                                                                         #      2      \n",
       "                                                                         Bat…          \n",
       "                                                                         Job…   5      \n",
       "                                                                         Tim…   0:0…   \n",
       "                                                                         Sta…   12/…   \n",
       "                                                                         Time   15:…   \n",
       "                                                                         ~      12/…   \n",
       "                                                                         Stop   01:…   \n",
       "                                                                         Time          \n",
       "                                                                         ~      0:1…   \n",
       "                                                                         Dur…          \n",
       "                                                                         ---…   ---…   \n",
       "                                                                              \n",
       "                                                                         Jobs          \n",
       "                                                                                      \n",
       "                                                                                       \n",
       "                                                                                       \n",
       "                                                                                       \n",
       "                                                                                       \n",
       "                                                                                       \n",
       "                                                                                       \n",
       "                                                                                       \n",
       "                ╰──────────────────────────────────────────────────────╯╰─────────────────╯\n",
       "                ╭─ CPU: 0/8T - Memory: ─╮╭─ Protocol Timeline: T─╮╭─ Protocol Timeline: E─╮\n",
       "                     ┌───────────────… ││     ┌───────────────… ││     ┌───────────────… \n",
       "                 1.00┤ ••• % CPU Util. ││ 2.00┤ ••• Total       ││ 1.00┤ ███ SC          \n",
       "                                      ││                      ││ █████████████████    \n",
       "                 0.80┤ ••• % Mem Util. ││ 1.60┤                 ││ 0.80┤ ███ MC          \n",
       "                                     ││                      ││ █████████████████    \n",
       "                 0.60┤                 ││ 1.20┤                 ││ 0.60┤ ███ HS          \n",
       "                 •••              ││                     ││ █████████████████    \n",
       "                 0.40┤•••••••••••••••… ││ 0.80┤                 ││ 0.40┤                 \n",
       "                 0.20┤•••••••••••••••… ││                      ││ █████████████████    \n",
       "                 0.00┤•••••••  •••    ││ 0.40┤                 ││ 0.21┤███████████████… \n",
       "                 •• ••              ││                      ││ █████████████████    \n",
       "                     └┬──────────────… ││ 0.00┤                 ││ 0.01┤███████████████… \n",
       "                      12/05-20:05:03   ││                      ││ █████████████████    \n",
       "                 12/09-15:38:18        ││     └───────────────… ││     └────────┬──────… \n",
       "                                       ││                 12/0… ││  \n",
       "                                       ││ 15:38                 ││                       \n",
       "                                       ││                       ││                       \n",
       "                                       ││                       ││                       \n",
       "                                       ││                       ││                       \n",
       "                                       ││                       ││                       \n",
       "                                       ││                       ││                       \n",
       "                                       ││                       ││                       \n",
       "╰────────────────╯╰───────────────────────╯╰───────────────────────╯╰───────────────────────╯\n",
       "
\n" ], "text/plain": [ "\u001b[37;44m╭───────────────────────────────────────────────────────────────────────────────────────────╮\u001b[0m\n", "\u001b[37;44m│\u001b[0m\u001b[37;44m \u001b[0m\u001b[1;37;44mGeneral Settings 💡\u001b[0m\u001b[37;44m \u001b[0m\u001b[37;44m __ \u001b[0m\u001b[1;37;44mThu Dec 9 15:38:18 2021 ⏰\u001b[0m\u001b[37;44m \u001b[0m\u001b[37;44m│\u001b[0m\n", "\u001b[37;44m│\u001b[0m\u001b[37;44m \u001b[0m\u001b[37;44m \u001b[0m\u001b[37;44m_ __ \u001b[0m\u001b[37;44m \u001b[0m\u001b[37;44m \u001b[0m\u001b[37;44m│\u001b[0m\n", "\u001b[37;44m│\u001b[0m\u001b[37;44m \u001b[0m\u001b[37;44m• GCS Sync Protocol: \u001b[0m\u001b[32;44m✔\u001b[0m\u001b[37;44m \u001b[0m\u001b[37;44m ____ ___ / /__ \u001b[0m\u001b[37;44m Author: \u001b[0m\u001b]8;id=914304;https://twitter.com/RobertTLange\u001b\\\u001b[4;37;44m@RobertTLange\u001b[0m\u001b]8;;\u001b\\\u001b[37;44m 🐦\u001b[0m\u001b[37;44m \u001b[0m\u001b[37;44m│\u001b[0m\n", "\u001b[37;44m│\u001b[0m\u001b[37;44m \u001b[0m\u001b[37;44m \u001b[0m\u001b[37;44m____ ___ ____ ____ (_) /_____ \u001b[0m\u001b[37;44m \u001b[0m\u001b[37;44m \u001b[0m\u001b[37;44m│\u001b[0m\n", "\u001b[37;44m│\u001b[0m\u001b[37;44m \u001b[0m\u001b[37;44m \u001b[0m\u001b[37;44m_____ \u001b[0m\u001b[37;44m \u001b[0m\u001b[37;44m \u001b[0m\u001b[37;44m│\u001b[0m\n", "\u001b[37;44m╰───────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n", "\u001b[31m╭─\u001b[0m\u001b[31m Local - Util \u001b[0m\u001b[31m─╮\u001b[0m\u001b[94m╭─\u001b[0m\u001b[94m─────────── Experiment Protocol Summary ────────────\u001b[0m\u001b[94m─╮\u001b[0m\u001b[33m╭─\u001b[0m\u001b[33m Total Number o\u001b[0m\u001b[33m─╮\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\u001b[94m│\u001b[0m \u001b[34m \u001b[0m \u001b[94m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m \u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m \u001b[0m \u001b[31m│\u001b[0m\u001b[94m│\u001b[0m \u001b[34m \u001b[0m\u001b[1;34m 
\u001b[0m\u001b[1;34m🔖\u001b[0m\u001b[1;34m \u001b[0m\u001b[34m \u001b[0m\u001b[1;34m \u001b[0m\u001b[1;34m🆔\u001b[0m\u001b[1;34m \u001b[0m\u001b[34m \u001b[0m\u001b[1;34m \u001b[0m\u001b[1;34m🗓 \u001b[0m\u001b[1;34m \u001b[0m\u001b[34m \u001b[0m\u001b[1;34m \u001b[0m\u001b[1;34mP…\u001b[0m\u001b[1;34m \u001b[0m\u001b[34m \u001b[0m\u001b[1;34m \u001b[0m\u001b[1;34mP…\u001b[0m\u001b[1;34m \u001b[0m\u001b[34m \u001b[0m\u001b[1;34m \u001b[0m\u001b[1;34m…\u001b[0m\u001b[1;34m \u001b[0m\u001b[34m \u001b[0m\u001b[1;34m \u001b[0m\u001b[1;33m▶\u001b[0m\u001b[1;34m \u001b[0m\u001b[1;34m \u001b[0m\u001b[34m \u001b[0m\u001b[1;34m \u001b[0m\u001b[1;33m♻\u001b[0m\u001b[1;34m \u001b[0m\u001b[34m \u001b[0m\u001b[1;34m \u001b[0m\u001b[1;34m…\u001b[0m\u001b[1;34m \u001b[0m\u001b[34m \u001b[0m\u001b[1;34m \u001b[0m\u001b[1;34mG…\u001b[0m\u001b[1;34m \u001b[0m\u001b[34m \u001b[0m\u001b[1;34m \u001b[0m\u001b[1;34m…\u001b[0m\u001b[1;34m \u001b[0m\u001b[34m \u001b[0m \u001b[94m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m \u001b[0m \u001b[1;33m…\u001b[0m \u001b[33m \u001b[0m \u001b[33m \u001b[0m \u001b[32m✔\u001b[0m \u001b[33m \u001b[0m \u001b[33m \u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m \u001b[0m \u001b[31m \u001b[0m \u001b[31m \u001b[0m \u001b[31m \u001b[0m \u001b[31m \u001b[0m \u001b[31m│\u001b[0m\u001b[94m│\u001b[0m \u001b[34m \u001b[0m \u001b[34m \u001b[0m \u001b[34m \u001b[0m \u001b[34m \u001b[0m \u001b[34m \u001b[0m \u001b[34m \u001b[0m \u001b[34m \u001b[0m \u001b[34m \u001b[0m \u001b[34m \u001b[0m \u001b[34m \u001b[0m \u001b[34m \u001b[0m\u001b[1;34m \u001b[0m\u001b[1;34m…\u001b[0m\u001b[1;34m \u001b[0m\u001b[34m \u001b[0m \u001b[94m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m \u001b[0m 1 \u001b[33m \u001b[0m \u001b[33m \u001b[0m 0 \u001b[33m \u001b[0m \u001b[33m \u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m ──────── \u001b[0m \u001b[31m│\u001b[0m\u001b[94m│\u001b[0m \u001b[34m \u001b[0m \u001b[34m \u001b[0m \u001b[34m \u001b[0m 
\u001b[34m \u001b[0m \u001b[34m \u001b[0m \u001b[34m \u001b[0m \u001b[34m \u001b[0m \u001b[34m \u001b[0m \u001b[34m \u001b[0m \u001b[34m \u001b[0m \u001b[34m \u001b[0m\u001b[1;34m \u001b[0m\u001b[1;34m…\u001b[0m\u001b[1;34m \u001b[0m\u001b[34m \u001b[0m \u001b[94m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m \u001b[0m \u001b[1;33m…\u001b[0m \u001b[33m \u001b[0m \u001b[33m \u001b[0m \u001b[1;33m…\u001b[0m \u001b[33m \u001b[0m \u001b[33m \u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m \u001b[0m \u001b[31m \u001b[0m \u001b[31m \u001b[0m \u001b[31m \u001b[0m \u001b[31m \u001b[0m \u001b[31m│\u001b[0m\u001b[94m│\u001b[0m \u001b[34m \u001b[0m \u001b[34m \u001b[0m \u001b[34m \u001b[0m \u001b[34m \u001b[0m \u001b[34m \u001b[0m \u001b[34m \u001b[0m \u001b[34m \u001b[0m \u001b[34m \u001b[0m \u001b[34m \u001b[0m \u001b[34m \u001b[0m \u001b[34m \u001b[0m\u001b[1;34m \u001b[0m\u001b[1;33m✔\u001b[0m\u001b[1;34m \u001b[0m\u001b[34m \u001b[0m \u001b[94m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m \u001b[0m 0 \u001b[33m \u001b[0m \u001b[33m \u001b[0m 0 \u001b[33m \u001b[0m \u001b[33m \u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m \u001b[0m\u001b[2m \u001b[0m\u001b[31m \u001b[0m\u001b[2m \u001b[0m\u001b[31m \u001b[0m\u001b[2m \u001b[0m\u001b[31m \u001b[0m\u001b[2m \u001b[0m\u001b[31m \u001b[0m \u001b[31m│\u001b[0m\u001b[94m│\u001b[0m \u001b[34m ────────────────────────────────────────────────── \u001b[0m \u001b[94m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m \u001b[0m \u001b[1;33m-\u001b[0m \u001b[33m \u001b[0m \u001b[33m \u001b[0m \u001b[1;33m…\u001b[0m \u001b[33m \u001b[0m \u001b[33m \u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m \u001b[0m \u001b[31m \u001b[0m \u001b[31m \u001b[0m \u001b[31m \u001b[0m \u001b[31m \u001b[0m \u001b[31m│\u001b[0m\u001b[94m│\u001b[0m \u001b[34m \u001b[0m \u001b[35m⠸ \u001b[0m \u001b[34m \u001b[0m 1 \u001b[34m \u001b[0m 1… \u001b[34m \u001b[0m M… \u001b[34m \u001b[0m T… \u001b[34m \u001b[0m … \u001b[34m 
\u001b[0m L… \u001b[34m \u001b[0m 5 \u001b[34m \u001b[0m 2 \u001b[34m \u001b[0m 1 \u001b[34m \u001b[0m \u001b[34m \u001b[0m \u001b[94m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m \u001b[0m - \u001b[33m \u001b[0m \u001b[33m \u001b[0m 0 \u001b[33m \u001b[0m \u001b[33m \u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m \u001b[0m\u001b[2m \u001b[0m\u001b[31m \u001b[0m\u001b[2m \u001b[0m\u001b[31m \u001b[0m\u001b[2m \u001b[0m\u001b[31m \u001b[0m\u001b[2m \u001b[0m\u001b[31m \u001b[0m \u001b[31m│\u001b[0m\u001b[94m│\u001b[0m \u001b[34m \u001b[0m \u001b[34m \u001b[0m \u001b[34m \u001b[0m \u001b[34m \u001b[0m \u001b[34m \u001b[0m P… \u001b[34m \u001b[0m \u001b[34m \u001b[0m \u001b[34m \u001b[0m \u001b[34m \u001b[0m \u001b[34m \u001b[0m \u001b[34m \u001b[0m \u001b[34m \u001b[0m \u001b[94m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m \u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m ──────── \u001b[0m \u001b[31m│\u001b[0m\u001b[94m│\u001b[0m \u001b[34m \u001b[0m \u001b[94m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m \u001b[0m \u001b[31m \u001b[0m \u001b[31m \u001b[0m \u001b[31m \u001b[0m \u001b[31m \u001b[0m \u001b[31m│\u001b[0m\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m \u001b[0m \u001b[31m│\u001b[0m\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m \u001b[0m \u001b[31m│\u001b[0m\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m \u001b[0m \u001b[31m \u001b[0m \u001b[31m \u001b[0m \u001b[31m \u001b[0m \u001b[31m \u001b[0m \u001b[31m│\u001b[0m\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m ──────── \u001b[0m \u001b[31m│\u001b[0m\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m│\u001b[0m\n", 
"\u001b[31m│\u001b[0m \u001b[31m \u001b[0m \u001b[31m│\u001b[0m\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m \u001b[0m \u001b[31m│\u001b[0m\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m \u001b[0m \u001b[31m \u001b[0m \u001b[31m \u001b[0m \u001b[31m \u001b[0m\u001b[1;31m \u001b[0m\u001b[1;31m…\u001b[0m\u001b[1;31m \u001b[0m\u001b[31m \u001b[0m \u001b[31m│\u001b[0m\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m ──────────── \u001b[0m \u001b[31m│\u001b[0m\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\u001b[33m╰─────────────────╯\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m \u001b[0m \u001b[31m \u001b[0m \u001b[31m \u001b[0m \u001b[31m \u001b[0m … \u001b[31m \u001b[0m \u001b[31m│\u001b[0m\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\u001b[33m╭─\u001b[0m\u001b[33m Last Experimen\u001b[0m\u001b[33m─╮\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m \u001b[0m \u001b[31m \u001b[0m \u001b[31m \u001b[0m \u001b[31m \u001b[0m … \u001b[31m \u001b[0m \u001b[31m│\u001b[0m\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m \u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m \u001b[0m \u001b[31m \u001b[0m \u001b[31m \u001b[0m \u001b[31m \u001b[0m … \u001b[31m \u001b[0m \u001b[31m│\u001b[0m\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m \u001b[0m \u001b[1;33mE-ID\u001b[0m \u001b[33m \u001b[0m 1 \u001b[33m \u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m \u001b[0m \u001b[31m \u001b[0m \u001b[31m \u001b[0m \u001b[31m \u001b[0m … \u001b[31m \u001b[0m \u001b[31m│\u001b[0m\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m \u001b[0m 
\u001b[1;33mType\u001b[0m \u001b[33m \u001b[0m hyp… \u001b[33m \u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m \u001b[0m \u001b[31m \u001b[0m \u001b[31m \u001b[0m \u001b[31m \u001b[0m … \u001b[31m \u001b[0m \u001b[31m│\u001b[0m\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m \u001b[0m \u001b[1;33mDir.\u001b[0m \u001b[33m \u001b[0m log… \u001b[33m \u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m \u001b[0m \u001b[31m \u001b[0m \u001b[31m \u001b[0m \u001b[31m \u001b[0m … \u001b[31m \u001b[0m \u001b[31m│\u001b[0m\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m \u001b[0m \u001b[1;33mScr…\u001b[0m \u001b[33m \u001b[0m tra… \u001b[33m \u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m \u001b[0m \u001b[31m \u001b[0m \u001b[31m \u001b[0m \u001b[31m \u001b[0m … \u001b[31m \u001b[0m \u001b[31m│\u001b[0m\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m \u001b[0m \u001b[1;33mCon…\u001b[0m \u001b[33m \u001b[0m bas… \u001b[33m \u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m \u001b[0m \u001b[31m \u001b[0m \u001b[31m \u001b[0m \u001b[31m \u001b[0m … \u001b[31m \u001b[0m \u001b[31m│\u001b[0m\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m \u001b[0m \u001b[1;33mSta…\u001b[0m \u001b[33m \u001b[0m Run… \u001b[33m \u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m \u001b[0m \u001b[31m \u001b[0m \u001b[31m \u001b[0m \u001b[31m \u001b[0m … \u001b[31m \u001b[0m \u001b[31m│\u001b[0m\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m \u001b[0m \u001b[33m \u001b[0m 🏃 \u001b[33m \u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m \u001b[0m \u001b[31m \u001b[0m \u001b[31m \u001b[0m \u001b[31m \u001b[0m … \u001b[31m \u001b[0m \u001b[31m│\u001b[0m\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m \u001b[0m \u001b[1;33mRes…\u001b[0m \u001b[33m \u001b[0m 
loc… \u001b[33m \u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m \u001b[0m \u001b[31m \u001b[0m \u001b[31m \u001b[0m \u001b[31m \u001b[0m … \u001b[31m \u001b[0m \u001b[31m│\u001b[0m\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m \u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m \u001b[0m \u001b[31m \u001b[0m \u001b[31m \u001b[0m \u001b[31m \u001b[0m … \u001b[31m \u001b[0m \u001b[31m│\u001b[0m\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m ──────────── \u001b[0m \u001b[31m│\u001b[0m\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m \u001b[0m \u001b[31m \u001b[0m \u001b[31m \u001b[0m \u001b[31m \u001b[0m\u001b[1m \u001b[0m\u001b[1m…\u001b[0m\u001b[1m \u001b[0m\u001b[31m \u001b[0m \u001b[31m│\u001b[0m\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m \u001b[0m \u001b[31m│\u001b[0m\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\u001b[33m╰─────────────────╯\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\u001b[94m│\u001b[0m 
\u001b[94m│\u001b[0m\u001b[33m╭─\u001b[0m\u001b[33m Experiment Com\u001b[0m\u001b[33m─╮\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m \u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m \u001b[0m \u001b[1;33mCon…\u001b[0m \u001b[33m \u001b[0m 2/5 \u001b[33m \u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m \u001b[0m \u001b[1;33mTot…\u001b[0m \u001b[33m \u001b[0m 10 \u001b[33m \u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m \u001b[0m \u001b[1;33mJobs\u001b[0m \u001b[33m \u001b[0m \u001b[33m \u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m \u001b[0m \u001b[1;33m# \u001b[0m \u001b[33m \u001b[0m 2 \u001b[33m \u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m \u001b[0m \u001b[1;33mBat…\u001b[0m \u001b[33m \u001b[0m \u001b[33m \u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m \u001b[0m \u001b[1;33mJob…\u001b[0m \u001b[33m \u001b[0m 5 \u001b[33m \u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m \u001b[0m \u001b[1;33mTim…\u001b[0m \u001b[33m \u001b[0m 0:0… \u001b[33m \u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m \u001b[0m \u001b[1;33mSta…\u001b[0m \u001b[33m \u001b[0m 12/… \u001b[33m \u001b[0m 
\u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m \u001b[0m \u001b[1;33mTime\u001b[0m \u001b[33m \u001b[0m 15:… \u001b[33m \u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m \u001b[0m \u001b[1;33m~ \u001b[0m \u001b[33m \u001b[0m 12/… \u001b[33m \u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m \u001b[0m \u001b[1;33mStop\u001b[0m \u001b[33m \u001b[0m 01:… \u001b[33m \u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m \u001b[0m \u001b[1;33mTime\u001b[0m \u001b[33m \u001b[0m \u001b[33m \u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m \u001b[0m \u001b[1;33m~ \u001b[0m \u001b[33m \u001b[0m 0:1… \u001b[33m \u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m \u001b[0m \u001b[1;33mDur…\u001b[0m \u001b[33m \u001b[0m \u001b[33m \u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m \u001b[0m \u001b[1;33m---…\u001b[0m \u001b[33m \u001b[0m \u001b[1;33m---…\u001b[0m \u001b[33m \u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m \u001b[0m ⏳ \u001b[33m \u001b[0m \u001b[35m…\u001b[0m \u001b[35m …\u001b[0m \u001b[33m \u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m \u001b[0m \u001b[1;33mJobs\u001b[0m \u001b[33m 
\u001b[0m \u001b[33m \u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m \u001b[0m \u001b[1;32m✔\u001b[0m \u001b[33m \u001b[0m \u001b[33m \u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m \u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\u001b[94m│\u001b[0m \u001b[94m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\u001b[94m╰──────────────────────────────────────────────────────╯\u001b[0m\u001b[33m╰─────────────────╯\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\u001b[33m╭─\u001b[0m\u001b[33m CPU: 0/8T - Memory: \u001b[0m\u001b[33m─╮\u001b[0m\u001b[33m╭─\u001b[0m\u001b[33m Protocol Timeline: T\u001b[0m\u001b[33m─╮\u001b[0m\u001b[33m╭─\u001b[0m\u001b[33m Protocol Timeline: E\u001b[0m\u001b[33m─╮\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\u001b[33m│\u001b[0m \u001b[37;40m ┌───────────────…\u001b[0m \u001b[33m│\u001b[0m\u001b[33m│\u001b[0m \u001b[37;40m ┌───────────────…\u001b[0m \u001b[33m│\u001b[0m\u001b[33m│\u001b[0m \u001b[37;40m ┌───────────────…\u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m 
\u001b[31m│\u001b[0m\u001b[33m│\u001b[0m \u001b[37;40m1.00┤\u001b[0m\u001b[40m \u001b[0m\u001b[31;40m•••\u001b[0m\u001b[37;40m % CPU Util.\u001b[0m \u001b[33m│\u001b[0m\u001b[33m│\u001b[0m \u001b[37;40m2.00┤\u001b[0m\u001b[40m \u001b[0m\u001b[33;40m•••\u001b[0m\u001b[37;40m Total \u001b[0m\u001b[40m \u001b[0m \u001b[33m│\u001b[0m\u001b[33m│\u001b[0m \u001b[37;40m1.00┤\u001b[0m\u001b[40m \u001b[0m\u001b[34;40m███\u001b[0m\u001b[37;40m SC \u001b[0m\u001b[40m \u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\u001b[33m│\u001b[0m \u001b[37;40m│\u001b[0m \u001b[33m│\u001b[0m\u001b[33m│\u001b[0m \u001b[37;40m│\u001b[0m \u001b[33m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33;40m█████████████████\u001b[0m\u001b[37;40m│\u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\u001b[33m│\u001b[0m \u001b[37;40m0.80┤\u001b[0m\u001b[40m \u001b[0m\u001b[33;40m•••\u001b[0m\u001b[37;40m % Mem Util.\u001b[0m \u001b[33m│\u001b[0m\u001b[33m│\u001b[0m \u001b[37;40m1.60┤ \u001b[0m\u001b[40m \u001b[0m \u001b[33m│\u001b[0m\u001b[33m│\u001b[0m \u001b[37;40m0.80┤\u001b[0m\u001b[40m \u001b[0m\u001b[31;40m███\u001b[0m\u001b[37;40m MC \u001b[0m\u001b[40m \u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\u001b[33m│\u001b[0m \u001b[31;40m•\u001b[0m\u001b[40m \u001b[0m\u001b[37;40m│\u001b[0m \u001b[33m│\u001b[0m\u001b[33m│\u001b[0m \u001b[37;40m│\u001b[0m \u001b[33m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33;40m█████████████████\u001b[0m\u001b[37;40m│\u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\u001b[33m│\u001b[0m \u001b[37;40m0.60┤ \u001b[0m \u001b[33m│\u001b[0m\u001b[33m│\u001b[0m \u001b[37;40m1.20┤\u001b[0m\u001b[40m \u001b[0m \u001b[33m│\u001b[0m\u001b[33m│\u001b[0m \u001b[37;40m0.60┤\u001b[0m\u001b[40m \u001b[0m\u001b[33;40m███\u001b[0m\u001b[37;40m HS \u001b[0m\u001b[40m \u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\u001b[33m│\u001b[0m 
\u001b[31;40m•••\u001b[0m\u001b[40m \u001b[0m\u001b[31;40m•\u001b[0m\u001b[40m \u001b[0m\u001b[31;40m•\u001b[0m\u001b[40m \u001b[0m\u001b[31;40m•\u001b[0m\u001b[40m \u001b[0m\u001b[31;40m•\u001b[0m\u001b[40m \u001b[0m\u001b[37;40m│\u001b[0m \u001b[33m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33;40m•\u001b[0m\u001b[40m \u001b[0m\u001b[37;40m│\u001b[0m \u001b[33m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33;40m█████████████████\u001b[0m\u001b[37;40m│\u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\u001b[33m│\u001b[0m \u001b[37;40m0.40┤\u001b[0m\u001b[31;40m•••••••••••••••…\u001b[0m \u001b[33m│\u001b[0m\u001b[33m│\u001b[0m \u001b[37;40m0.80┤\u001b[0m\u001b[40m \u001b[0m \u001b[33m│\u001b[0m\u001b[33m│\u001b[0m \u001b[37;40m0.40┤ \u001b[0m\u001b[40m \u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\u001b[33m│\u001b[0m \u001b[37;40m0.20┤\u001b[0m\u001b[33;40m•••••••••••••••…\u001b[0m \u001b[33m│\u001b[0m\u001b[33m│\u001b[0m \u001b[37;40m│\u001b[0m \u001b[33m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33;40m█████████████████\u001b[0m\u001b[37;40m│\u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\u001b[33m│\u001b[0m \u001b[37;40m0.00┤\u001b[0m\u001b[31;40m••••\u001b[0m\u001b[33;40m•\u001b[0m\u001b[31;40m•••\u001b[0m\u001b[40m \u001b[0m\u001b[31;40m•••\u001b[0m\u001b[40m \u001b[0m \u001b[33m│\u001b[0m\u001b[33m│\u001b[0m \u001b[37;40m0.40┤\u001b[0m\u001b[40m \u001b[0m \u001b[33m│\u001b[0m\u001b[33m│\u001b[0m \u001b[37;40m0.21┤\u001b[0m\u001b[33;40m███████████████…\u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\u001b[33m│\u001b[0m \u001b[31;40m••\u001b[0m\u001b[40m \u001b[0m\u001b[31;40m••\u001b[0m\u001b[40m \u001b[0m\u001b[31;40m•\u001b[0m\u001b[40m \u001b[0m\u001b[31;40m•\u001b[0m\u001b[37;40m│\u001b[0m \u001b[33m│\u001b[0m\u001b[33m│\u001b[0m \u001b[37;40m│\u001b[0m \u001b[33m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33;40m█████████████████\u001b[0m\u001b[37;40m│\u001b[0m 
\u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\u001b[33m│\u001b[0m \u001b[37;40m └┬──────────────…\u001b[0m \u001b[33m│\u001b[0m\u001b[33m│\u001b[0m \u001b[37;40m0.00┤\u001b[0m\u001b[40m \u001b[0m \u001b[33m│\u001b[0m\u001b[33m│\u001b[0m \u001b[37;40m0.01┤\u001b[0m\u001b[33;40m███████████████…\u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\u001b[33m│\u001b[0m \u001b[37;40m 12/05-20:05:03 \u001b[0m \u001b[33m│\u001b[0m\u001b[33m│\u001b[0m \u001b[37;40m│\u001b[0m \u001b[33m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33;40m█████████████████\u001b[0m\u001b[37;40m│\u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\u001b[33m│\u001b[0m \u001b[37;40m12/09-15:38:18 \u001b[0m \u001b[33m│\u001b[0m\u001b[33m│\u001b[0m \u001b[37;40m └───────────────…\u001b[0m \u001b[33m│\u001b[0m\u001b[33m│\u001b[0m \u001b[37;40m └────────┬──────…\u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m│\u001b[0m\u001b[33m│\u001b[0m \u001b[37;40m 12/0…\u001b[0m \u001b[33m│\u001b[0m\u001b[33m│\u001b[0m \u001b[37;40m …\u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m│\u001b[0m\u001b[33m│\u001b[0m \u001b[37;40m15:38 \u001b[0m \u001b[33m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m│\u001b[0m\u001b[33m│\u001b[0m 
\u001b[33m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m│\u001b[0m \u001b[31m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m│\u001b[0m\u001b[33m│\u001b[0m \u001b[33m│\u001b[0m\n", "\u001b[31m╰────────────────╯\u001b[0m\u001b[33m╰───────────────────────╯\u001b[0m\u001b[33m╰───────────────────────╯\u001b[0m\u001b[33m╰───────────────────────╯\u001b[0m\n" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Get a static snapshot of the protocol & resource utilisation\n", "# Note: This will look a lot nicer in your terminal!\n", "dashboard.snapshot()" ] }, { "cell_type": "code", "execution_count": null, "id": "a0ca1ea3-9824-47ca-92ee-f62ed3a7a3de", "metadata": {}, "outputs": [], "source": [ "# Run monitoring in while loop - dashboard\n", "dashboard.live()" ] }, { "cell_type": "markdown", "id": "d63912fe-3c40-455a-921d-4bce4422a896", "metadata": {}, "source": [ "- Add widget animation!/screenshot" ] }, { "cell_type": "markdown", "id": "b8467015-0f7c-4576-95c6-2c698f5e8680", "metadata": {}, "source": [ "# Integration with the MLE-Infrastructure Ecosystem 🔺\n", "## Running a Hyperparameter Search for Multiple Random Seeds" ] }, { "cell_type": "code", "execution_count": 16, "id": "277fcec3-11d3-4a13-81d6-bc261d6a8ebf", "metadata": {}, "outputs": [], "source": [ "try:\n", " from mle_hyperopt import RandomSearch\n", " from mle_scheduler import MLEQueue\n", " from mle_logging import load_meta_log\n", "except:\n", " !pip install -q mle-hyperopt mle-scheduler mle-logging\n", " !pip install --upgrade rich\n", " from mle_hyperopt import RandomSearch\n", " from mle_scheduler import MLEQueue\n", " from mle_logging import load_meta_log" ] }, { "cell_type": "markdown", "id": "1c58b0ec-af23-45a4-95d9-1e54a1040476", "metadata": {}, "source": [ "We 
again start by adding an experiment to the protocol at launch time." ] }, { "cell_type": "code", "execution_count": 17, "id": "cffe7cdd-de5e-4809-9206-a9e87b76f916", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
[15:38:26] INFO     Added experiment 2 to protocol.                       mle_protocol.py:162\n",
       "
\n" ], "text/plain": [ "\u001b[2;36m[15:38:26]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Added experiment \u001b[1;36m2\u001b[0m to protocol. \u001b]8;id=165623;file:///Users/rob/Dropbox/core-code/mle-infrastructure/mle-monitor/mle_monitor/mle_protocol.py\u001b\\\u001b[2mmle_protocol.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=63651;file:///Users/rob/Dropbox/core-code/mle-infrastructure/mle-monitor/mle_monitor/mle_protocol.py#162\u001b\\\u001b[2m162\u001b[0m\u001b]8;;\u001b\\\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
           INFO     Locally stored protocol: mle_protocol.db               mle_protocol.py:91\n",
       "
\n" ], "text/plain": [ "\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Locally stored protocol: mle_protocol.db \u001b]8;id=144531;file:///Users/rob/Dropbox/core-code/mle-infrastructure/mle-monitor/mle_monitor/mle_protocol.py\u001b\\\u001b[2mmle_protocol.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=493805;file:///Users/rob/Dropbox/core-code/mle-infrastructure/mle-monitor/mle_monitor/mle_protocol.py#91\u001b\\\u001b[2m91\u001b[0m\u001b]8;;\u001b\\\n" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Load (existing) protocol database and add experiment data\n", "protocol_db = MLEProtocol(\"mle_protocol.db\", verbose=True)\n", "meta_data = {\n", " \"purpose\": \"random search\", # Purpose of experiment\n", " \"project_name\": \"surrogate\", # Project name of experiment\n", " \"exec_resource\": \"local\", # Resource jobs are run on\n", " \"experiment_dir\": \"logs_search\", # Experiment log storage directory\n", " \"experiment_type\": \"hyperparameter-search\", # Type of experiment to run\n", " \"base_fname\": \"train.py\", # Main code script to execute\n", " \"config_fname\": \"base_config.json\", # Config file path of experiment\n", " \"num_seeds\": 2, # Number of evaluations seeds\n", " \"num_total_jobs\": 4, # Number of total jobs to run\n", " \"num_jobs_per_batch\": 4, # Number of jobs in single batch\n", " \"num_job_batches\": 1, # Number of sequential job batches\n", " \"time_per_job\": \"00:00:02\", # Expected duration: days-hours-minutes\n", "}\n", "new_experiment_id = protocol_db.add(meta_data)" ] }, { "cell_type": "markdown", "id": "8ace6804-6f2b-457a-9f6d-4dc78ed912e0", "metadata": {}, "source": [ "Afterwards, we leverage `mle-hyperopt` to instantiate a random search strategy with its parameter space. 
We then ask for two configurations and store them as `.yaml` files in our working directory:" ] }, { "cell_type": "code", "execution_count": 18, "id": "7cd0cf2e-dce9-4e5f-8270-67ffa56809df", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
                    MLE-Hyperopt Random Search Hyperspace 🚀                    \n",
       "                                                                                \n",
       "      🌻 Variable   Type          Search Range ↔                                \n",
       "     ──────────────────────────────────────────────────────────────────────     \n",
       "      arch          categorical   ['mlp', 'cnn']                                \n",
       "      lrate         real          Begin: 0.1, End: 0.5, Prior: log-uniform      \n",
       "      batch_size    integer       Begin: 1, End: 5, Prior: uniform              \n",
       "                                                                                \n",
       "
\n" ], "text/plain": [ " \u001b[3m MLE-Hyperopt Random Search Hyperspace 🚀 \u001b[0m \n", " \n", " \u001b[1m \u001b[0m\u001b[1m🌻 Variable\u001b[0m\u001b[1m \u001b[0m \u001b[1m \u001b[0m\u001b[1mType \u001b[0m\u001b[1m \u001b[0m \u001b[1;31m \u001b[0m\u001b[1;31mSearch Range ↔ \u001b[0m\u001b[1;31m \u001b[0m \n", " ────────────────────────────────────────────────────────────────────── \n", " arch categorical \u001b[31m \u001b[0m\u001b[31m['mlp', 'cnn'] \u001b[0m\u001b[31m \u001b[0m \n", " lrate real \u001b[31m \u001b[0m\u001b[31mBegin: 0.1, End: 0.5, Prior: log-uniform\u001b[0m\u001b[31m \u001b[0m \n", " batch_size integer \u001b[31m \u001b[0m\u001b[31mBegin: 1, End: 5, Prior: uniform \u001b[0m\u001b[31m \u001b[0m \n", " \n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "[{'arch': 'mlp', 'lrate': 0.360379148648584, 'batch_size': 3},\n", " {'arch': 'cnn', 'lrate': 0.26208630215377515, 'batch_size': 2}]" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Instantiate random search class\n", "strategy = RandomSearch(\n", " real={\"lrate\": {\"begin\": 0.1, \"end\": 0.5, \"prior\": \"log-uniform\"}},\n", " integer={\"batch_size\": {\"begin\": 1, \"end\": 5, \"prior\": \"uniform\"}},\n", " categorical={\"arch\": [\"mlp\", \"cnn\"]},\n", " verbose=True,\n", ")\n", "\n", "# Ask for configurations to evaluate & run parallel eval of seeds * configs\n", "configs, config_fnames = strategy.ask(2, store=True)\n", "configs" ] }, { "cell_type": "markdown", "id": "672d70dd-ee77-40c7-a720-e0015c78e5ed", "metadata": {}, "source": [ "Next, we can use a `MLEQueue` from `mle-scheduler` to run our training script `train.py` for our two configurations and two different random seeds. `train.py` implements a simple surrogate training loop, which logs some statistics with the help of `mle-logging`. 
Afterwards, we merge the resulting logs into a single `meta_log.hdf5` and retrieve the mean (over seeds) test loss score for both configurations." ] }, { "cell_type": "code", "execution_count": 19, "id": "6b700165-75b7-4b7c-a83c-1338c1645c43", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "d9861ce83a384229baf7afaf9b94ee79", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Output()" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
[15:38:35] INFO     Locally stored protocol: mle_protocol.db               mle_protocol.py:91\n",
       "
\n" ], "text/plain": [ "\u001b[2;36m[15:38:35]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Locally stored protocol: mle_protocol.db \u001b]8;id=974722;file:///Users/rob/Dropbox/core-code/mle-infrastructure/mle-monitor/mle_monitor/mle_protocol.py\u001b\\\u001b[2mmle_protocol.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=774122;file:///Users/rob/Dropbox/core-code/mle-infrastructure/mle-monitor/mle_monitor/mle_protocol.py#91\u001b\\\u001b[2m91\u001b[0m\u001b]8;;\u001b\\\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
           INFO     Locally stored protocol: mle_protocol.db               mle_protocol.py:91\n",
       "
\n" ], "text/plain": [ "\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Locally stored protocol: mle_protocol.db \u001b]8;id=575975;file:///Users/rob/Dropbox/core-code/mle-infrastructure/mle-monitor/mle_monitor/mle_protocol.py\u001b\\\u001b[2mmle_protocol.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=875911;file:///Users/rob/Dropbox/core-code/mle-infrastructure/mle-monitor/mle_monitor/mle_protocol.py#91\u001b\\\u001b[2m91\u001b[0m\u001b]8;;\u001b\\\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
           INFO     Locally stored protocol: mle_protocol.db               mle_protocol.py:91\n",
       "
\n" ], "text/plain": [ "\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Locally stored protocol: mle_protocol.db \u001b]8;id=189310;file:///Users/rob/Dropbox/core-code/mle-infrastructure/mle-monitor/mle_monitor/mle_protocol.py\u001b\\\u001b[2mmle_protocol.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=640619;file:///Users/rob/Dropbox/core-code/mle-infrastructure/mle-monitor/mle_monitor/mle_protocol.py#91\u001b\\\u001b[2m91\u001b[0m\u001b]8;;\u001b\\\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
[15:38:36] INFO     Locally stored protocol: mle_protocol.db               mle_protocol.py:91\n",
       "
\n" ], "text/plain": [ "\u001b[2;36m[15:38:36]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Locally stored protocol: mle_protocol.db \u001b]8;id=706452;file:///Users/rob/Dropbox/core-code/mle-infrastructure/mle-monitor/mle_monitor/mle_protocol.py\u001b\\\u001b[2mmle_protocol.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=688773;file:///Users/rob/Dropbox/core-code/mle-infrastructure/mle-monitor/mle_monitor/mle_protocol.py#91\u001b\\\u001b[2m91\u001b[0m\u001b]8;;\u001b\\\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
  MLEQueue - local • 4/4 Jobs ━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% • 0:00:01 ⌛\n",
       "
\n" ], "text/plain": [ " \u001b[1;34mMLEQueue - local •\u001b[0m 4/4 Jobs \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[35m100% •\u001b[0m \u001b[33m0:00:01\u001b[0m ⌛\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n",
       "
\n" ], "text/plain": [ "\n" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "queue = MLEQueue(\n", " resource_to_run=\"local\",\n", " job_filename=\"train.py\",\n", " config_filenames=config_fnames,\n", " random_seeds=[1, 2],\n", " experiment_dir=\"logs_search\",\n", " protocol_db=protocol_db,\n", ")\n", "queue.run()\n", "\n", "# Merge logs of random seeds & configs -> load & get final scores\n", "queue.merge_configs(merge_seeds=True)\n", "meta_log = load_meta_log(\"logs_search/meta_log.hdf5\")\n", "test_scores = [meta_log[r].stats.test_loss.mean[-1] for r in queue.mle_run_ids]" ] }, { "cell_type": "markdown", "id": "7d1f1508-d527-43f2-98ba-eff277328684", "metadata": {}, "source": [ "Finally, we update the random search strategy and tell the protocol that the experiment has been completed:" ] }, { "cell_type": "code", "execution_count": 20, "id": "fa05ce77-20e4-473e-a7ff-162b0b1b9e19", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
┏━━━━━━━━━━━━━━━┳━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓\n",
       "┃ 📥 Total: 2    ID  Obj. 📉  Configuration 🔖 - 12/09/2021 15:38:43        ┃\n",
       "┡━━━━━━━━━━━━━━━╇━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩\n",
       "│ Best Overall  │ 0  │ 1.193   │ 'arch': 'mlp', 'lrate': 0.360379148648584,    │\n",
       "│               │    │         │ 'batch_size': 3                               │\n",
       "│ Best in Batch │ 0  │ 1.193   │ 'arch': 'mlp', 'lrate': 0.360379148648584,    │\n",
       "│               │    │         │ 'batch_size': 3                               │\n",
       "└───────────────┴────┴─────────┴───────────────────────────────────────────────┘\n",
       "
\n" ], "text/plain": [ "┏━━━━━━━━━━━━━━━┳━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓\n", "┃\u001b[1m \u001b[0m\u001b[1m📥 Total: 2 \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mID\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mObj. 📉\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mConfiguration 🔖 - 12/09/2021 15:38:43 \u001b[0m\u001b[1m \u001b[0m┃\n", "┡━━━━━━━━━━━━━━━╇━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩\n", "│\u001b[2m \u001b[0m\u001b[2mBest Overall \u001b[0m\u001b[2m \u001b[0m│ 0 │ 1.193 │ 'arch': 'mlp', 'lrate': 0.360379148648584, │\n", "│ │ │ │ 'batch_size': 3 │\n", "│\u001b[2m \u001b[0m\u001b[2mBest in Batch\u001b[0m\u001b[2m \u001b[0m│ 0 │ 1.193 │ 'arch': 'mlp', 'lrate': 0.360379148648584, │\n", "│ │ │ │ 'batch_size': 3 │\n", "└───────────────┴────┴─────────┴───────────────────────────────────────────────┘\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
[15:38:43] INFO     Locally stored protocol: mle_protocol.db               mle_protocol.py:91\n",
       "
\n" ], "text/plain": [ "\u001b[2;36m[15:38:43]\u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Locally stored protocol: mle_protocol.db \u001b]8;id=388358;file:///Users/rob/Dropbox/core-code/mle-infrastructure/mle-monitor/mle_monitor/mle_protocol.py\u001b\\\u001b[2mmle_protocol.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=128964;file:///Users/rob/Dropbox/core-code/mle-infrastructure/mle-monitor/mle_monitor/mle_protocol.py#91\u001b\\\u001b[2m91\u001b[0m\u001b]8;;\u001b\\\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
           INFO     Updated protocol - COMPLETED: 2                       mle_protocol.py:253\n",
       "
\n" ], "text/plain": [ "\u001b[2;36m \u001b[0m\u001b[2;36m \u001b[0m\u001b[34mINFO \u001b[0m Updated protocol - COMPLETED: \u001b[1;36m2\u001b[0m \u001b]8;id=549681;file:///Users/rob/Dropbox/core-code/mle-infrastructure/mle-monitor/mle_monitor/mle_protocol.py\u001b\\\u001b[2mmle_protocol.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=588184;file:///Users/rob/Dropbox/core-code/mle-infrastructure/mle-monitor/mle_monitor/mle_protocol.py#253\u001b\\\u001b[2m253\u001b[0m\u001b]8;;\u001b\\\n" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Update the hyperparameter search strategy\n", "strategy.tell(configs, test_scores)\n", "\n", "# Wrap up experiment (store completion time, etc.)\n", "protocol_db.complete(new_experiment_id)" ] }, { "cell_type": "markdown", "id": "80f0479f-ca08-41d8-a696-2517a7bc7a23", "metadata": {}, "source": [ "Give it a try and let me know what you think! If you find a bug or are missing your favourite feature, feel free to contact me [@RobertTLange](https://twitter.com/RobertTLange) or create an issue!" ] } ], "metadata": { "kernelspec": { "display_name": "mle-toolbox", "language": "python", "name": "mle-toolbox" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.7" } }, "nbformat": 4, "nbformat_minor": 5 } ================================================ FILE: examples/protocol_gcs.py ================================================ # Import of helpers for protocoling experiments import os from mle_monitor import MLEProtocol def run_protocol_gcs(): """Protocol an experiment and sync it + results in GCS bucket.""" # Define GCS settings - requires 'GOOGLE_APPLICATION_CREDENTIALS' env var. 
cloud_settings = { "project_name": "mle-toolbox", # GCP project name "bucket_name": "mle-protocol", # GCS bucket name "use_protocol_sync": True, # Whether to sync the protocol to GCS "use_results_storage": True, # Whether to sync experiment_dir to GCS } protocol_db = MLEProtocol("mle_protocol.db", cloud_settings, verbose=True) # Draft data to store in protocol meta_data = { "purpose": "Test 123", # Purpose of experiment "project_name": "MNIST", # Project name of experiment "exec_resource": "local", # Resource jobs are run on "experiment_dir": "log_dir", # Experiment log storage directory "experiment_type": "hyperparameter-search", # Type of experiment to run "config_fname": "base_config.json", # Config file path of experiment } new_experiment_id = protocol_db.add(meta_data) # ... train your network - or create a placeholder directory + file os.makedirs(meta_data["experiment_dir"]) with open(os.path.join(meta_data["experiment_dir"], "readme.txt"), "w") as f: f.write("This is some example file for the mle-monitor example.") # Store experiment_dir as .zip in GCS bucket protocol_db.complete(new_experiment_id) def retrieve_results(): """Retrieve the stored experiment dir from GCS bucket and unzip it.""" # Define GCS settings - requires 'GOOGLE_APPLICATION_CREDENTIALS' env var. 
cloud_settings = { "project_name": "mle-toolbox", # GCP project name "bucket_name": "mle-protocol", # GCS bucket name "use_protocol_sync": True, # Whether to sync the protocol to GCS "use_results_storage": True, # Whether to sync experiment_dir to GCS } protocol_db = MLEProtocol("mle_protocol.db", cloud_settings, verbose=True) # Retrieve last stored experiment from GCS bucket -> retrieved_log_dir protocol_db.retrieve(protocol_db.last_experiment_id, "retrieved_log_dir/") if __name__ == "__main__": import argparse parser = argparse.ArgumentParser(description="Let's train a network.") parser.add_argument("-retrieve", "--retrieve", action="store_true", default=False) args = vars(parser.parse_args()) if args["retrieve"]: retrieve_results() else: run_protocol_gcs() ================================================ FILE: examples/protocol_local.py ================================================ # Import of helpers for protocoling experiments import time from mle_monitor import MLEProtocol, MLEResource, MLEDashboard def run_protocol_local(): """Protocol an experiment and update the progress status.""" # Load (existing) protocol database and print summary protocol_db = MLEProtocol("mle_protocol.db", verbose=False) protocol_db.summary(tail=10, verbose=True) # Ask whether to abort/delete experiment data - only if previously used! 
def run_protocol_local():
    """Protocol an experiment and update its progress status."""
    # Load the (possibly pre-existing) protocol database and print a summary
    db = MLEProtocol("mle_protocol.db", verbose=False)
    db.summary(tail=10, verbose=True)

    # Offer to delete/abort earlier experiment data - only if db was used before
    if len(db) > 0:
        db.ask_for_e_id(action="delete")
        db.ask_for_e_id(action="abort")

    # Query the purpose of the experiment via command line input
    purpose = db.ask_for_purpose()

    # Assemble the experiment meta data and register it in the protocol
    meta_data = {
        "purpose": purpose,  # Purpose of experiment
        "project_name": "MNIST",  # Project name of experiment
        "exec_resource": "local",  # Resource jobs are run on
        "experiment_dir": "log_dir",  # Experiment log storage directory
        "experiment_type": "hyperparameter-search",  # Type of experiment to run
        "base_fname": "main.py",  # Main code script to execute
        "config_fname": "base_config.json",  # Config file path of experiment
        "num_seeds": 5,  # Number of evaluations seeds
        "num_total_jobs": 10,  # Number of total jobs to run
        "num_jobs_per_batch": 5,  # Number of jobs in single batch
        "num_job_batches": 2,  # Number of sequential job batches
        "time_per_job": "00:05:00",  # Expected duration: days-hours-minutes
        "num_cpus": 2,  # Number of CPUs used in job
        "num_gpus": 1,  # Number of GPUs used in job
    }
    e_id = db.add(meta_data)

    # Pretend to complete the 10 jobs, ticking the progress bar after each
    for _ in range(10):
        db.update_progress_bar(e_id)
        time.sleep(3)

    # Wrap up experiment (store completion time, etc.)
    db.complete(e_id)
def main():
    """Full mle-infrastructure example: logging, search, queue, protocol."""
    # Register the experiment meta data in the (possibly existing) protocol db
    db = MLEProtocol("mle_protocol.db")
    meta_data = {
        "purpose": "random search",  # Purpose of experiment
        "project_name": "surrogate",  # Project name of experiment
        "exec_resource": "local",  # Resource jobs are run on
        "experiment_dir": "logs_search",  # Experiment log storage directory
        "experiment_type": "hyperparameter-search",  # Type of experiment to run
        "base_fname": "train.py",  # Main code script to execute
        "config_fname": "base_config.json",  # Config file path of experiment
        "num_seeds": 2,  # Number of evaluations seeds
        "num_total_jobs": 4,  # Number of total jobs to run
        "num_jobs_per_batch": 4,  # Number of jobs in single batch
        "num_job_batches": 1,  # Number of sequential job batches
        "time_per_job": "00:00:02",  # Expected duration: days-hours-minutes
    }
    e_id = db.add(meta_data)

    # Set up the random hyperparameter search strategy
    strategy = RandomSearch(
        real={"lrate": {"begin": 0.1, "end": 0.5, "prior": "log-uniform"}},
        integer={"batch_size": {"begin": 1, "end": 5, "prior": "uniform"}},
        categorical={"arch": ["mlp", "cnn"]},
        verbose=True,
    )

    # Sample two configurations and evaluate all seeds * configs in parallel
    configs, config_fnames = strategy.ask(2, store=True)
    queue = MLEQueue(
        resource_to_run="local",
        job_filename="train.py",
        config_filenames=config_fnames,
        random_seeds=[1, 2],
        experiment_dir="logs_search",
        protocol_db=db,
    )
    queue.run()

    # Merge logs of random seeds & configs -> load & get final test scores
    queue.merge_configs(merge_seeds=True)
    meta_log = load_meta_log("logs_search/meta_log.hdf5")
    scores = [meta_log[r].stats.test_loss.mean[-1] for r in queue.mle_run_ids]

    # Feed the scores back to the strategy and wrap up the experiment
    strategy.tell(configs, scores)
    db.complete(e_id)
def train_your_net(epoch: int, seed_id: int, lrate: float, batch_size: int, arch: str):
    """Surrogate training objective. Optimum: lrate=0.2, batch_size=4, arch='conv'."""
    # Quadratic bowl around the optimum plus a flat penalty for non-conv archs
    arch_penalty = 0 if arch == "conv" else 10
    base = (lrate - 0.2) ** 2 + (batch_size - 4) ** 2 + arch_penalty
    # Deterministic per-seed offset; the test loss also carries U(0, 0.3) noise
    seed_offset = seed_id * 0.5
    train_loss = base + seed_offset
    test_loss = train_loss + random.uniform(0, 0.3)
    # Both losses shrink as 1/epoch to mimic learning progress
    return train_loss / epoch, test_loss / epoch
def make_util_plot(util_hist) -> Align:
    """Render the cluster CPU/memory utilisation time series as a rich renderable."""
    # Materialize both series as plain lists with integer step indices
    cpu_series = np.array(util_hist["rel_cpu_util"]).tolist()
    mem_series = np.array(util_hist["rel_mem_util"]).tolist()
    cpu_steps = np.arange(len(cpu_series)).tolist()
    mem_steps = np.arange(len(mem_series)).tolist()

    # Reset the plotext canvas and draw both utilisation curves
    plt.clear_plot()
    plt.plot(cpu_steps, cpu_series, marker="dot", color="red", label="% CPU Util.")
    plt.plot(mem_steps, mem_series, marker="dot", color="yellow", label="% Mem Util.")
    plt.figure.plot_size(42, 9)
    plt.canvas_color("black")
    plt.axes_color("black")
    plt.ticks_color("white")
    plt.ylim(0, 1)

    # Label only the first and last monitored time points ("<date>-<hour>")
    tick_ids = [0, len(util_hist["times_hour"]) - 1]
    tick_labels = [
        util_hist["times_date"][i][:5] + "-" + util_hist["times_hour"][i]
        for i in tick_ids
    ]
    plt.xticks(tick_ids, tick_labels)

    # Decode the ANSI canvas into rich lines and stack them in a grid
    ansi_rows = list(AnsiDecoder().decode(plotext_helper()))
    grid = Table.grid()
    grid.add_column()
    for row in ansi_rows:
        grid.add_row(row)
    return Align.center(grid)
def make_total_experiments(total_data) -> Align:
    """Generate rich table summarizing all experiments in protocol db."""
    table = Table(
        show_header=False,
        show_footer=False,
        header_style="bold yellow",
        border_style="yellow",
        box=box.SIMPLE_HEAD,
    )
    # Four unnamed columns - the table alternates header and count rows
    for _ in range(4):
        table.add_column()
    # Overall counts by job status
    table.add_row(
        "[b yellow]Total",
        ":running:",
        "[green]:heavy_check_mark:",
        "[red]:heavy_multiplication_x:",
    )
    table.add_row(
        total_data["total"], total_data["run"], total_data["done"], total_data["aborted"]
    )
    # Counts by execution resource
    table.add_row(
        Text.from_markup("[b yellow]SGE"),
        Text.from_markup("[b yellow]Slurm"),
        Text.from_markup("[b yellow]GCP"),
        Text.from_markup("[b yellow]Local"),
    )
    table.add_row(
        total_data["sge"], total_data["slurm"], total_data["gcp"], total_data["local"]
    )
    # Counts for report generation / GCS storage / retrieval
    table.add_row(
        Text.from_markup("[b yellow]-"),
        Text.from_markup("[b yellow]Report"),
        Text.from_markup("[b yellow]GCS"),
        Text.from_markup("[b yellow]Sync"),
    )
    table.add_row(
        "-", total_data["report_gen"], total_data["gcs_stored"], total_data["retrieved"]
    )
    return Align.center(table)
def make_last_experiment(last_data) -> Align:
    """Generate rich table summarizing last scheduled experiment settings."""
    table = Table(
        show_header=False,
        show_footer=False,
        box=box.SIMPLE_HEAD,
        header_style="bold yellow",
        border_style="yellow",
    )
    table.add_column()
    table.add_column()
    # Static label/value rows pulled straight from the protocol entry
    for label, key in (
        ("[b yellow]E-ID", "e_id"),
        ("[b yellow]Type", "e_type"),
        ("[b yellow]Dir.", "e_dir"),
        ("[b yellow]Script", "e_script"),
        ("[b yellow]Config", "e_config"),
    ):
        table.add_row(Text.from_markup(label), last_data[key])
    # Status row - only rendered for the three known states (as before)
    status_markup = {
        "running": "Running :running:",
        "completed": "Completed [green]:heavy_check_mark:",
        "aborted": "Aborted [red]:heavy_multiplication_x:",
    }
    status = last_data["job_status"]
    if status in status_markup:
        table.add_row(Text.from_markup("[b yellow]Status"), status_markup[status])
    table.add_row(Text.from_markup("[b yellow]Resource"), last_data["resource"])
    return Align.center(table)
def make_est_completion(time_data) -> Align:
    """Generate rich table summarizing estim. time of experim. completion."""
    table = Table(
        show_header=False,
        show_footer=False,
        box=box.SIMPLE_HEAD,
        header_style="bold yellow",
        border_style="yellow",
    )
    table.add_column()
    table.add_column()
    # Configs/seeds split followed by the plain job/batch accounting rows
    table.add_row(
        Text.from_markup("[b yellow]Conf/Seeds"),
        f"{int(time_data['total_jobs']/time_data['num_seeds'])}/{time_data['num_seeds']}",
    )
    for label, value in (
        ("[b yellow]Total Jobs", str(time_data["total_jobs"])),
        ("[b yellow]# Batches", str(time_data["total_batches"])),
        ("[b yellow]Jobs/Batch", str(time_data["jobs_per_batch"])),
        ("[b yellow]Time/Batch", str(time_data["time_per_batch"][1:])),
        ("[b yellow]Start Time", str(time_data["start_time"])),
    ):
        table.add_row(Text.from_markup(label), value)
    # Completed runs show the exact stop time/duration, running ones a "~" estimate
    prefix = "" if time_data["job_status"] == "completed" else "~ "
    table.add_row(
        Text.from_markup(f"[b yellow]{prefix}Stop Time"), str(time_data["stop_time"])
    )
    table.add_row(
        Text.from_markup(f"[b yellow]{prefix}Duration"), str(time_data["duration"])
    )
    # Progress bar for completed vs. total jobs
    progress = Progress(
        TextColumn("{task.completed}/{task.total}", justify="left", style="magenta"),
        BarColumn(bar_width=10, style="magenta"),
        TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
        auto_refresh=False,
    )
    task = progress.add_task("queue", total=time_data["total_jobs"])
    progress.update(task, completed=time_data["completed_jobs"], refresh=True)
    table.add_row(
        "[b yellow]-----------",
        "[b yellow]-----------------",
    )
    table.add_row(
        Text.from_markup(
            ":hourglass_flowing_sand: [b yellow]Jobs [green]:heavy_check_mark:"
        ),
        progress,
    )
    return Align.center(table)
def make_user_jobs_cluster(user_data: dict) -> Align:
    """Generate rich table summarizing jobs scheduled by users."""
    # Cap the displayed users at 13 rows; footer sums still cover everyone
    n_rows = min(len(user_data["total"]), 13)
    sums = {key: str(sum(user_data[key])) for key in ("total", "run", "wait")}

    table = Table(
        show_header=True,
        show_footer=True,
        header_style="bold red",
        row_styles=["none", "dim"],
        border_style="red",
        box=box.SIMPLE,
    )
    table.add_column(
        "USER",
        Text.from_markup("[b]Sum", justify="right"),
        style="white",
        justify="left",
    )
    table.add_column("ALL", sums["total"], justify="center")
    table.add_column(":running:", sums["run"], justify="center")
    table.add_column(":pause_button:", sums["wait"], justify="center")

    # One row per displayed user: all / running / waiting job counts
    for idx in range(n_rows):
        table.add_row(
            user_data["user"][idx],
            str(user_data["total"][idx]),
            str(user_data["run"][idx]),
            str(user_data["wait"][idx]),
        )
    return table
def make_device_panel_local(device_data: dict) -> Align:
    """Generate rich table summarizing CPU core and GPU utilization locally.

    Args:
        device_data: Dict with per-core lists "core_id" and "percent_util"
            plus per-GPU lists "gpu_name", "gpu_load", "gpu_mem_util",
            "gpu_mem_total".

    Returns:
        Centered rich renderable stacking the core table over the GPU table.
    """
    sum_all = str(round(sum(device_data["percent_util"]), 1))
    t1 = Table(
        show_header=True,
        show_footer=True,
        header_style="bold red",
        row_styles=["none", "dim"],
        border_style="red",
        box=box.SIMPLE,
    )
    t1.add_column(
        ":computer:",
        Text.from_markup("[b]Sum", justify="right"),
        style="white",
        justify="left",
    )
    t1.add_column("%", sum_all, justify="center")
    t1.add_column(":computer:", "---", justify="center")
    t1.add_column("%", "---", justify="center")
    # Two-column layout: core i sits next to core i + half.
    # BUG FIX: the original second index was int(i + num_cores / 2 - 1),
    # which displayed the last core of the first half twice and never
    # displayed the final core at all.
    num_cores = len(device_data["core_id"])
    half = num_cores // 2
    for i in range(half):
        t1.add_row(
            "C" + str(device_data["core_id"][i]),
            str(device_data["percent_util"][i]),
            "C" + str(device_data["core_id"][i + half]),
            str(device_data["percent_util"][i + half]),
        )
    # GPU table: name, load %, memory util % and total memory in GB
    t2 = Table(
        show_header=True,
        show_footer=False,
        box=box.SIMPLE,
        border_style="red",
        header_style="bold red",
    )
    t2.add_column("GPU", style="white", justify="left")
    t2.add_column(":computer: %", justify="center")
    t2.add_column(":floppy_disk: %", justify="center")
    t2.add_column("GB", justify="center")
    for g in range(len(device_data["gpu_name"])):
        t2.add_row(
            str(device_data["gpu_name"][g]),
            str(device_data["gpu_load"][g]),
            str(device_data["gpu_mem_util"][g]),
            str(device_data["gpu_mem_total"][g]),
        )
    # Stack both tables vertically in an outer borderless grid
    table = Table(box=box.SIMPLE_HEAD, show_header=False, show_footer=False)
    table.add_column()
    table.add_row(t1)
    table.add_row(t2)
    return Align.center(table)
def make_process_panel_local(proc_data) -> Align:
    """Generate rich table summarizing the busiest local processes."""
    t1 = Table(
        show_header=True,
        show_footer=True,
        header_style="bold red",
        border_style="red",
        box=box.SIMPLE,
    )
    t1.add_column("PID", "---", style="white", justify="left")
    t1.add_column("NAME", "---", justify="center")
    t1.add_column(
        ":computer: %", str(round(proc_data["total_cpu_util"], 1)), justify="center"
    )
    t1.add_column(
        ":floppy_disk:", str(round(proc_data["total_mem_util"], 1)), justify="center"
    )
    # One row per process: pid, truncated name, CPU % and memory %.
    # (The original iterated the two halves in separate loops - the single
    # loop below visits exactly the same indices in the same order.)
    for idx in range(len(proc_data["pid"])):
        t1.add_row(
            str(proc_data["pid"][idx]),
            str(proc_data["p_name"][idx][:6]),
            str(round(proc_data["cpu_util"][idx], 1)),
            str(round(proc_data["mem_util"][idx], 1)),
        )
    return t1
def make_gcp_util(gcp_data) -> Align:
    """Generate rich table summarizing running GCP VM machines."""
    table = Table(
        show_header=True,
        show_footer=False,
        border_style="white",
        header_style="bold magenta",
        box=box.SIMPLE_HEAD,
    )
    table.add_column("Machine", style="white", justify="left")
    # Remaining columns share the same centered layout
    for title in ("Run", "Stage", "Stop"):
        table.add_column(title, justify="center")
    # One row per machine type with its run/stage/stop counters
    for m_id in range(len(gcp_data["experiment_type"])):
        table.add_row(
            str(gcp_data["experiment_type"][m_id]),
            str(gcp_data["run"][m_id]),
            str(gcp_data["stage"][m_id]),
            str(gcp_data["stop"][m_id]),
        )
    return table
ETA layout["top-right-right"].split_column( Layout(name="top-right-right-1", ratio=3), Layout(name="top-right-right-2", ratio=3), Layout(name="top-right-right-3", ratio=4), ) # Split bottom into toolbox command info and cluster util termplots layout["bottom-right"].split_row( Layout(name="bottom-right-1", ratio=1), Layout(name="bottom-right-2", ratio=1), Layout(name="bottom-right-3", ratio=1), ) # # Fill the header with life! layout["header"].update(Header(resource, use_gcs_sync, protocol_fname)) return layout class Header: """Display header with clock and general toolbox configurations.""" welcome_ascii = """ __ _ __ ____ ___ / /__ ____ ___ ____ ____ (_) /_____ _____ / __ `__ \/ / _ \______/ __ `__ \/ __ \/ __ \/ / __/ __ \/ ___/ / / / / / / / __/_____/ / / / / / /_/ / / / / / /_/ /_/ / / /_/ /_/ /_/_/\___/ /_/ /_/ /_/\____/_/ /_/_/\__/\____/_/ """.splitlines() def __init__(self, resource: str, use_gcs_sync: bool, protocol_fname: str): self.resource = resource self.use_gcs_sync = use_gcs_sync self.protocol_fname = protocol_fname def __rich__(self) -> Panel: grid = Table.grid(expand=True) grid.add_column(justify="left") grid.add_column(justify="left") grid.add_column(justify="right") grid.add_row( "[bold]General Settings :bulb:", Header.welcome_ascii[0], "[bold]" + dt.datetime.now().ctime().replace(" ", " ") + " :alarm_clock:", ) grid.add_row( "\u2022 GCS Sync Protocol: [green]:heavy_check_mark:" if self.use_gcs_sync else "\u2022 GCS Sync Protocol: [red]:heavy_multiplication_x:", Header.welcome_ascii[1], "Author: [u link=https://twitter.com/RobertTLange]@RobertTLange[/] :bird:", ) grid.add_row( "\u2022 GCS Sync Results: [green]:heavy_check_mark:" if self.use_gcs_sync else "\u2022 GCS Sync Results: [red]:heavy_multiplication_x:", Header.welcome_ascii[2], ":point_right: [u link=https://github.com/mle-infrastructure]MLE-Infrastructure[/] :small_red_triangle:", ) grid.add_row( f"\u2022 DB Path: {self.protocol_fname}", Header.welcome_ascii[3], f"Resource: 
def update_dashboard(layout, resource_data, protocol_data, usage_data):
    """Helper function that fills dashboard with life!"""
    # Left column: cluster job tables, or device/process panels when local
    if resource_data["resource_name"] in ["sge-cluster", "slurm-cluster"]:
        queue_var = (
            "PARTITION"
            if resource_data["resource_name"] == "slurm-cluster"
            else "QUEUE"
        )
        grid = Table.grid(expand=True)
        grid.add_column()
        grid.add_row(make_user_jobs_cluster(resource_data["user_data"]))
        grid.add_row(make_node_jobs_cluster(resource_data["host_data"], queue_var))
        grid.add_row(make_node_jobs_cluster(resource_data["node_data"], "NODE"))
        left_panel = Panel(
            Align.center(grid),
            border_style="red",
            title="Jobs by User/Queue/Node",
        )
    else:
        grid = Table.grid()
        grid.add_column()
        grid.add_row(make_device_panel_local(resource_data["host_data"]))
        grid.add_row(make_process_panel_local(resource_data["user_data"]))
        left_panel = Panel(
            Align.center(grid),
            border_style="red",
            title="Local - Util by Device/Process",
        )
    layout["left"].update(left_panel)

    # Center: experiment protocol summary table
    layout["top-right-left"].update(
        Panel(
            make_protocol(protocol_data["protocol_table"]),
            border_style="bright_blue",
            title="Experiment Protocol Summary",
        )
    )
    # Right column: totals, last experiment config and completion estimate
    layout["top-right-right-1"].update(
        Panel(
            make_total_experiments(protocol_data["total_data"]),
            border_style="yellow",
            title="Total Number of Experiment Runs",
        )
    )
    layout["top-right-right-2"].update(
        Panel(
            make_last_experiment(protocol_data["last_data"]),
            border_style="yellow",
            title="Last Experiment Configuration",
        )
    )
    layout["top-right-right-3"].update(
        Panel(
            make_est_completion(protocol_data["time_data"]),
            border_style="yellow",
            title="Experiment Completion Time",
        )
    )
    # Footer: utilisation plot plus the two protocol timeline plots
    util = resource_data["util_data"]
    layout["bottom-right-1"].update(
        Panel(
            make_util_plot(usage_data),
            title=(
                f"CPU: {int(util['cores_util'])}/"
                f"{int(util['cores'])}T"
                f" - Memory: {int(util['mem_util'])}/"
                f"{int(util['mem'])}G"
            ),
            border_style="yellow",
        )
    )
    layout["bottom-right-2"].update(
        Panel(
            make_protocol_total_plot(protocol_data["summary_data"]),
            title=("Protocol Timeline: Total Experiments"),
            border_style="yellow",
        )
    )
    layout["bottom-right-3"].update(
        Panel(
            make_protocol_daily_plot(protocol_data["summary_data"]),
            title=("Protocol Timeline: Experiments/Day"),
            border_style="yellow",
        )
    )
    return layout
class MLEDashboard(object):
    """MLE Resource Dashboard - Rich-based terminal output."""

    def __init__(self, protocol: MLEProtocol, resource: MLEResource):
        self.protocol = protocol
        self.resource = resource
        self.tracker = Tracker()

    def _build_layout(self):
        # The layout skeleton depends on the resource and the protocol's
        # GCS-sync configuration.
        return layout_dashboard(
            self.resource,
            self.protocol.use_gcs_protocol_sync,
            self.protocol.protocol_fname,
        )

    def snapshot(self):
        """Print a single, non-interactive console snapshot."""
        layout = self._build_layout()
        resource_data = self.resource.monitor()
        protocol_data = self.protocol.monitor()
        usage_data = self.tracker.update(resource_data["util_data"])
        Console().print(
            update_dashboard(layout, resource_data, protocol_data, usage_data)
        )

    def live(self, pull_gcs: bool = False):
        """Continuously refresh the dashboard until interrupted."""
        # First render before entering the update loop.
        layout = self._build_layout()
        resource_data = self.resource.monitor()
        protocol_data = self.protocol.monitor()
        usage_data = self.tracker.update(resource_data["util_data"])
        layout = update_dashboard(
            layout, resource_data, protocol_data, usage_data
        )
        # Timers for reloading the local DB and pulling the remote GCS copy.
        timer_gcs = time.time()
        timer_db = time.time()
        with Live(console=Console(), screen=True, auto_refresh=True) as live:
            live.update(layout)
            while True:
                try:
                    resource_data = self.resource.monitor()
                    protocol_data = self.protocol.monitor()
                    usage_data = self.tracker.update(
                        resource_data["util_data"],
                        protocol_data["summary_data"],
                    )
                    layout = update_dashboard(
                        layout, resource_data, protocol_data, usage_data
                    )
                    # Reload the local database file every 2 seconds.
                    if time.time() - timer_db > 2:
                        self.protocol.load(pull_gcs=False)
                        timer_db = time.time()
                    # Pull the newest DB from GCS every 5 minutes.
                    if pull_gcs:
                        if time.time() - timer_gcs > 300:
                            self.protocol.load()
                            timer_gcs = time.time()
                except Exception:
                    # Best-effort refresh loop - never crash the live view.
                    pass
self.accessed_gcs = False ( self.db, self.experiment_ids, self.last_experiment_id, ) = load_protocol_db(self.protocol_fname) def get( self, experiment_id: Union[str, int, None] = None, var_name: Union[str, None] = None, ): """Retrieve variable from database.""" if experiment_id is None: experiment_id = str(self.last_experiment_id) elif type(experiment_id) == int: experiment_id = str(experiment_id) if var_name is None: return self.db.get(experiment_id) else: return self.db.dget(experiment_id, var_name) def save(self, send_gcs: bool = True): """Dump the protocol db to its pickle file.""" self.db.dump() if self.verbose: self.logger = setup_logger(logging.INFO) else: self.logger = setup_logger(logging.WARNING) self.logger.info(f"Locally stored protocol: {self.protocol_fname}") # Send recent/up-to-date experiment DB to Google Cloud Storage if send_gcs and self.use_gcs_protocol_sync: if self.accessed_gcs: self.gcs_send() self.logger.info(f"GCS synced protocol: {self.protocol_fname}") @property def standard_keys(self): """All required keys in standard dict to supply in `add`.""" return [ "purpose", "project_name", "exec_resource", "experiment_dir", "experiment_type", "base_fname", "config_fname", "num_seeds", "num_total_jobs", "num_job_batches", "num_jobs_per_batch", "time_per_job", "num_cpus", "num_gpus", ] @property def standard_default(self): """All required keys in standard dict to supply in `add`.""" return { "purpose": "None provided", "project_name": "default", "exec_resource": "local", "experiment_dir": "experiments", "experiment_type": "single", "base_fname": "main.py", "config_fname": "base_config.yaml", "num_seeds": 1, "num_total_jobs": 1, "num_job_batches": 1, "num_jobs_per_batch": 1, "time_per_job": "00:01:00", "num_cpus": 1, "num_gpus": 0, } def add( self, standard: dict, extra: Union[dict, None] = None, save: bool = True, send_gcs: bool = True, ): """Add an experiment to the database.""" for k in self.standard_keys: if k not in standard.keys(): standard[k] 
= self.standard_default[k] assert standard["experiment_type"] in [ "hyperparameter-search", "multiple-configs", "single-config", ] self.db, new_experiment_id = protocol_experiment( self.db, self.last_experiment_id, standard, extra ) self.experiment_ids.append(new_experiment_id) self.last_experiment_id = new_experiment_id self.added_experiment_id = new_experiment_id self.completed_jobs_counter = 0 self.logger.info(f"Added experiment {new_experiment_id} to protocol.") if save: self.save(send_gcs) return new_experiment_id def abort( self, experiment_id: Union[int, str], save: bool = True, send_gcs: bool = True, ): """Abort an experiment - change status in db.""" self.db.dadd(str(experiment_id), ("job_status", "aborted")) if save: self.save(send_gcs) def delete( self, experiment_id: Union[int, str], save: bool = True, send_gcs: bool = True, ): """Delete an experiment - change status in db.""" self.db.drem(str(experiment_id)) self.all_experiment_ids = list(self.db.getall()) try: self.all_experiment_ids.remove("summary") except Exception: pass if len(self.all_experiment_ids) > 0: self.last_experiment_id = int(self.all_experiment_ids[-1]) else: self.last_experiment_id = 0 if save: self.save(send_gcs) def update_progress_bar( self, experiment_id: Union[int, str, None] = None, completed_increment: int = 1, pull_gcs: bool = False, save: bool = True, send_gcs: bool = False, ): """Update progress bar of completed jobs using an integer increment.""" if experiment_id is None: experiment_id = self.added_experiment_id self.completed_jobs_counter += completed_increment try: self.load(pull_gcs) self.update( experiment_id, "completed_jobs", self.completed_jobs_counter, save=save, send_gcs=send_gcs, ) except Exception: pass def complete( self, experiment_id: Union[int, str], report: bool = False, save: bool = True, ): """Set status of an experiment to completed - change status in db.""" self.load() experiment_data = self.get(experiment_id) # Store experiment directory in GCS bucket 
under hash if self.use_gcs_protocol_storage: from .utils import send_gcloud_zip zip_to_store = experiment_data["e-hash"] + ".zip" experiment_dir = experiment_data["experiment_dir"] send_gcloud_zip( self.cloud_settings, experiment_dir, zip_to_store, True ) self.logger.info(f"Send results to GCS: {zip_to_store}") # Update and send protocol db time_t = datetime.now().strftime("%m/%d/%y %H:%M") stop_time = datetime.strptime(time_t, "%m/%d/%y %H:%M") start_time = datetime.strptime( experiment_data["start_time"], "%m/%d/%y %H:%M" ) duration = str(stop_time - start_time) self.update( experiment_id, var_name=[ "job_status", "stop_time", "duration", "report_generated", "stored_in_gcloud", "completed_jobs", ], var_value=[ "completed", time_t, duration, report, self.use_gcs_protocol_storage, experiment_data["num_total_jobs"], ], save=save, ) self.logger.info(f"Updated protocol - COMPLETED: {experiment_id}") def status(self, experiment_id: Union[int, str]) -> str: """Get the status of an experiment.""" return self.db.dget(str(experiment_id), "job_status") def update( self, experiment_id: Union[int, str], var_name: Union[List[str], str], var_value: Union[list, str, dict], save: bool = True, send_gcs: bool = True, ): """Update the data of an experiment.""" # Update the variable(s) of the experiment if type(var_name) == list: for db_v_id in range(len(var_name)): self.db.dadd( str(experiment_id), (var_name[db_v_id], var_value[db_v_id]) ) else: self.db.dadd(str(experiment_id), (var_name, var_value)) if save: self.save(send_gcs) def summary( self, tail: int = 5, verbose: bool = True, return_table: bool = False, full: bool = False, ): """Print a rich summary table of all experiments in db.""" summary = protocol_summary( self.db, self.experiment_ids, tail, verbose, full ) if return_table: return protocol_table(summary, full) return summary def monitor(self): """Get monitoring data used in dashboard.""" total_data, last_data, time_data = get_monitor_db_data(self) protocol_table = 
    def retrieve(
        self,
        experiment_id: Union[int, str],
        local_dir_name: Union[None, str] = None,
    ):
        """Retrieve experiment from GCS.

        Downloads the zipped results stored under the experiment's hash
        into `local_dir_name` and marks the entry as retrieved.
        Interactively re-prompts on stdin when the id is unknown.
        """
        # Lazy import: GCS helpers require google-cloud-storage.
        from .utils import get_gcloud_zip

        # Keep prompting until the user supplies an id that exists in the
        # protocol. NOTE(review): blocks on input() - not for headless use.
        while True:
            if str(experiment_id) not in self.experiment_ids:
                time_t = datetime.now().strftime("%m/%d/%Y %I:%M:%S %p")
                print(
                    time_t,
                    "The experiment you are trying to retrieve does not exist",
                )
                experiment_id = input(
                    time_t + " Which experiment do you want to retrieve?"
                )
            else:
                break

        # Update protocol retrieval status of the experiment
        # Results were zipped under the experiment hash when stored in GCS.
        hash_to_store = self.get(experiment_id, "e-hash")
        get_gcloud_zip(
            self.cloud_settings, hash_to_store, experiment_id, local_dir_name
        )
        self.update(experiment_id, "retrieved_results", True)
        self.logger.info(f"Retrieved results from GCS bucket: {experiment_id}.")
) return send_db def gcs_pull(self): """Pull the remote protocol from a GCS bucket.""" from .protocol import get_gcloud_db accessed_db = get_gcloud_db( self.cloud_settings["project_name"], self.cloud_settings["bucket_name"], self.cloud_settings["protocol_fname"], self.protocol_fname, ) self.logger.info( "Pulled protocol from GCS bucket:" f" {self.cloud_settings['bucket_name']}." ) return accessed_db def __len__(self) -> int: """Return number of experiments stored in protocol.""" return len(self.experiment_ids) def ask_for_e_id(self, action: str = "delete"): """Ask user if they want to delete/abort previous experiment by id.""" time_t = datetime.now().strftime("%m/%d/%Y %I:%M:%S %p") q_str = "{} Want to {} an experiment? - state its id: [e_id/N]" print(q_str.format(time_t, action), end=" ") sys.stdout.flush() # Loop over experiments to delete until "N" given or timeout after 60 secs while True: i, o, e = select.select([sys.stdin], [], [], 60) if i: e_id = sys.stdin.readline().strip() if e_id == "N": return e_id else: break if e_id not in self.experiment_ids: time_t = datetime.now().strftime("%m/%d/%Y %I:%M:%S %p") print( "\n{} The e_id is not in the protocol db. " "Please try again: [e_id/N]".format(time_t), end=" ", ) sys.stdout.flush() continue # Delete the dictionary in DB corresponding to e-id if action == "delete": self.delete(e_id, save=True) elif action == "abort": self.abort(e_id, save=True) else: return e_id print( "{} Another one? 
class MLEResource(object):
    """MLE Resource Instance - Get Monitoring Data."""

    def __init__(
        self, resource_name: str = "local", monitor_config: Union[dict, None] = None
    ):
        assert resource_name in [
            "local",
            "sge-cluster",
            "slurm-cluster",
            "gcp-cloud",
        ]
        self.resource_name = resource_name
        self.monitor_config = monitor_config
        # Dispatch to the matching backend; anything else runs locally.
        backend = {
            "sge-cluster": SGEResource,
            "slurm-cluster": SlurmResource,
            "gcp-cloud": GCPResource,
        }.get(self.resource_name, LocalResource)
        self.resource = backend(self.monitor_config)

    def monitor(self):
        """Collect resource-dependent utilization data as a dictionary."""
        if self.resource_name == "local":
            user_data, host_data, util_data = self.resource.monitor()
            return {
                "resource_name": self.resource_name,
                "user_data": user_data,
                "host_data": host_data,
                "util_data": util_data,
            }
        if self.resource_name in ["sge-cluster", "slurm-cluster"]:
            # Cluster backends additionally report per-node data.
            user_data, host_data, util_data, node_data = self.resource.monitor()
            return {
                "resource_name": self.resource_name,
                "user_data": user_data,
                "host_data": host_data,
                "util_data": util_data,
                "node_data": node_data,
            }
        # GCP backend already returns the final dictionary.
        return self.resource.monitor()
================================================ FILE: mle_monitor/protocol/__init__.py ================================================ from .load import load_protocol_db from .tables import protocol_summary, protocol_table from .add import protocol_experiment from .summary import get_monitor_db_data from .gcs_sync import set_gcp_credentials, send_gcloud_db, get_gcloud_db __all__ = [ "load_protocol_db", "protocol_summary", "protocol_table", "protocol_experiment", "get_monitor_db_data", "set_gcp_credentials", "send_gcloud_db", "get_gcloud_db", ] ================================================ FILE: mle_monitor/protocol/add.py ================================================ import os import hashlib import json import datetime as dt from datetime import datetime from typing import Union, Tuple from ..utils import load_json_config, load_yaml_config def protocol_experiment( db, last_experiment_id, standard: dict, extra: Union[dict, None] = None ): """Add the new experiment to the protocol database.""" # Add experiment summary data add_experiment_summary(db, standard["experiment_type"]) # Create a new db experiment entry new_experiment_id = str(last_experiment_id + 1) db.dcreate(new_experiment_id) # Add all standard & extra data points to protocol for k, v in standard.items(): db.dadd(new_experiment_id, (k, v)) if extra is not None: for k, v in extra.items(): db.dadd(new_experiment_id, (k, v)) try: import git except ImportError: print("You need to install `GitPython` to use git hash protocolling.") # Add the git latest commit hash try: g = git.Repo(search_parent_directories=True) git_hash = g.head.object.hexsha except Exception: git_hash = "no-git-repo" db.dadd(new_experiment_id, ("git_hash", git_hash)) # Add the base config - load from given configuration file(s) if type(standard["config_fname"]) == str: all_config_paths = [standard["config_fname"]] elif type(standard["base_train_config"]) == list: all_config_paths = standard["config_fname"] else: all_config_paths = 
def estimate_experiment_duration(
    start_time: str, time_per_batch: str, total_batches: int
) -> Tuple[str, str]:
    """Estimate total duration and completion time of an experiment.

    Args:
        start_time: Start timestamp formatted as "%m/%d/%y %H:%M".
        time_per_batch: Walltime of a single job batch as "DD:HH:MM".
        total_batches: Number of sequentially executed job batches.

    Returns:
        Tuple of (estimated duration "DD:HH:MM", estimated stop time
        formatted as "%m/%d/%y %H:%M").
    """
    days, hours, minutes = time_per_batch.split(":")
    # Scale the per-batch time by the number of batches, carrying
    # minutes -> hours and hours -> days.
    hours_add, tot_mins = divmod(total_batches * int(minutes), 60)
    days_add, tot_hours = divmod(total_batches * int(hours) + hours_add, 24)
    tot_days = total_batches * int(days) + days_add

    start_date = dt.datetime.strptime(start_time, "%m/%d/%y %H:%M")
    end_date = start_date + dt.timedelta(
        days=tot_days, hours=tot_hours, minutes=tot_mins
    )
    # Zero-pad all three fields. BUGFIX: the previous version only padded
    # hours/minutes - `tot_days = tot_days` was a no-op instead of a pad.
    est_duration = (
        f"{str(tot_days).zfill(2)}:{str(tot_hours).zfill(2)}"
        f":{str(tot_mins).zfill(2)}"
    )
    stop_time = end_date.strftime("%m/%d/%y %H:%M")
    return est_duration, stop_time
def get_gcloud_db(
    project_name: str,
    bucket_name: str,
    gcs_protocol_fname: str,
    local_protocol_fname: str,
    number_of_connect_tries: int = 5,
) -> int:
    """Download the latest protocol DB from a GCS bucket.

    Returns 1 on success (or when no remote DB exists yet, so a fresh one
    will be created locally) and 0 after exhausting all retries.
    """
    try:
        from google.cloud import storage
    except ImportError:
        raise ImportError(
            "You need to install `google-cloud-storage` to use GCP buckets."
        )

    logger = setup_logger()
    local_path = expanduser(local_protocol_fname)
    for attempt in range(number_of_connect_tries):
        try:
            # Connect to the project/bucket and stream the blob to disk.
            bucket = storage.Client(project_name).get_bucket(
                bucket_name, timeout=20
            )
            with open(local_path, "wb") as file_obj:
                bucket.blob(gcs_protocol_fname).download_to_file(file_obj)
            logger.info(f"Pulled from GCloud Storage - {gcs_protocol_fname}")
            return 1
        except Exception as ex:
            # A partially written/empty file breaks later loads - drop it.
            try:
                os.remove(local_path)
            except Exception:
                pass
            if type(ex).__name__ == "NotFound":
                # First-time use: no remote DB yet - a new one gets made.
                logger.info(
                    f"No DB found in GCloud Storage - {gcs_protocol_fname}"
                )
                logger.info(
                    f"New DB will be created - {project_name}/{bucket_name}"
                )
                return 1
            logger.info(
                f"Attempt {attempt+1}/{number_of_connect_tries}"
                " - Failed pulling from GCloud Storage"
                f" - {type(ex).__name__}"
            )
    # No successful connection established after all retries.
    return 0
def load_protocol_db(protocol_fname):
    """Load the local protocol database and reconstruct experiment ids.

    Args:
        protocol_fname: Path to the pickledb protocol file. pickledb
            creates a fresh DB if the file does not exist.

    Returns:
        Tuple of (pickledb handle, naturally sorted list of experiment id
        strings, integer id of the most recent experiment - 0 if empty).
    """
    # Attempt loading local protocol database - otherwise return clean one
    db = pickledb.load(protocol_fname, False, sig=False)

    # The "summary" key stores aggregate stats, not an experiment entry.
    # (Explicit check instead of the previous bare `except:` clause.)
    all_experiment_ids = list(db.getall())
    if "summary" in all_experiment_ids:
        all_experiment_ids.remove("summary")

    def natural_keys(text: str):
        """Sort key ordering embedded integers numerically, not lexically."""

        def atoi(chunk):
            return int(chunk) if chunk.isdigit() else chunk

        return [atoi(c) for c in re.split(r"(\d+)", text)]

    # Sort experiment ids & pick out the most recent one
    if all_experiment_ids:
        all_experiment_ids.sort(key=natural_keys)
        last_experiment_id = int(all_experiment_ids[-1])
    else:
        last_experiment_id = 0
    return db, all_experiment_ids, last_experiment_id
def get_total_experiments(db, all_experiment_ids):
    """Get data from db to show in 'total_experiments' panel."""
    cluster_resources = ["sge-cluster", "slurm-cluster", "gcp-cloud"]
    run = done = aborted = 0
    sge = slurm = gcp = local = 0
    report_gen = gcs_stored = retrieved = 0
    for e_id in all_experiment_ids:
        # Tally job status - anything neither running nor completed
        # counts as aborted.
        status = db.get(e_id, "job_status")
        if status == "running":
            run += 1
        elif status == "completed":
            done += 1
        else:
            aborted += 1
        # Tally execution resource - unknown resources count as local.
        resource = db.get(e_id, "exec_resource")
        sge += resource == "sge-cluster"
        slurm += resource == "slurm-cluster"
        gcp += resource == "gcp-cloud"
        local += resource not in cluster_resources
        # Optional flags - older entries may not carry these keys.
        try:
            report_gen += db.get(e_id, "report_generated")
            gcs_stored += db.get(e_id, "stored_in_gcloud")
            retrieved += db.get(e_id, "retrieved_results")
        except Exception:
            pass
    # All panel values are rendered as strings.
    return {
        "total": str(len(all_experiment_ids)),
        "run": str(run),
        "done": str(done),
        "aborted": str(aborted),
        "sge": str(sge),
        "slurm": str(slurm),
        "gcp": str(gcp),
        "local": str(local),
        "report_gen": str(report_gen),
        "gcs_stored": str(gcs_stored),
        "retrieved": str(retrieved),
    }
def get_last_experiment(db, last_experiment_id):
    """Get data from db to show in 'last_experiments' panel."""
    entry = db.get(last_experiment_id)
    # Map the short panel keys onto the full protocol field names;
    # insertion order matters for how the panel renders rows.
    field_map = {
        "job_status": "job_status",
        "e_dir": "experiment_dir",
        "e_type": "experiment_type",
        "e_script": "base_fname",
        "e_config": "config_fname",
        "report_gen": "report_generated",
        "resource": "exec_resource",
    }
    results = {"e_id": str(last_experiment_id)}
    for panel_key, db_key in field_map.items():
        results[panel_key] = entry[db_key]
    return results
def get_progress_bar(total_jobs: int, completed_jobs: int):
    """Build a compact rich progress bar showing completed/total jobs."""
    bar = Progress(
        TextColumn(
            "{task.completed:^3.0f}/{task.total:^3.0f}", justify="left", style="white"
        ),
        BarColumn(bar_width=10, style="red"),
        TextColumn("[progress.percentage]{task.percentage:>3.0f}%", style="white"),
        auto_refresh=False,
    )
    # Single task; set its completion state once and render statically.
    task_id = bar.add_task("queue", total=total_jobs)
    bar.update(task_id, completed=completed_jobs, refresh=True)
    return bar
def protocol_table(df, full: bool = True):
    """Generate pretty table of experiment protocol db - preselected db."""
    # Abbreviations for resources/experiment types shown in the table
    resource_abbrev = {
        "sge-cluster": "SGE",
        "slurm-cluster": "Slurm",
        "gcp-cloud": "GCP",
    }
    type_abbrev = {
        "hyperparameter-search": "search",
        "multiple-configs": "config",
        "single-config": "single",
    }
    table = Table(show_header=True, show_footer=False, header_style="bold blue")
    # (header, justify) specs for the base columns
    column_specs = [
        (":bookmark:", "center"),
        (":id:", "center"),
        (":spiral_calendar:", "center"),
        ("Project", None),
        ("Purpose", None),
        ("Type", None),
        ("[yellow]:arrow_forward:", "center"),
        ("[yellow]:recycle:", "center"),
        ("CPU", "center"),
        ("GPU", "center"),
    ]
    for header, justify in column_specs:
        if justify is None:
            table.add_column(header)
        else:
            table.add_column(header, justify=justify)
    # Full option prints also resource requirements of jobs
    if full:
        table.add_column(
            ":hourglass_flowing_sand: Completed Jobs [yellow]:heavy_check_mark:",
            justify="center",
        )
    # Add rows of info if dataframe exists (previously recorded experiments)
    if df is not None:
        for index in reversed(df.index):
            row = df.iloc[index]
            resource = resource_abbrev.get(row["Resource"], "Local")
            exp_type = type_abbrev.get(row["Type"], row["Type"])
            if row["Status"] == "running":
                status = Spinner("dots", style="magenta")
            elif row["Status"] == "completed":
                status = "[green]:heavy_check_mark:"
            else:
                status = "[red]:heavy_multiplication_x:"
            # Purpose is shown shorter when the progress column is present
            cells = [
                status,
                row["ID"],
                row["Date"],
                row["Project"][:10],
                row["Purpose"][:15] if full else row["Purpose"][:25],
                exp_type,
                resource,
                str(row["Seeds"]),
                str(row["CPUs"]),
                str(row["GPUs"]),
            ]
            if full:
                cells.append(
                    get_progress_bar(int(row["Jobs"]), int(row["Completed Jobs"]))
                )
            table.add_row(*cells)
    table.border_style = "blue"
    table.box = box.SIMPLE_HEAD
    return table
class GCPResource(object):
    """Utilisation monitor for GCP compute instances (via `gcloud` CLI)."""

    def __init__(self, monitor_config: Union[dict, None]):
        # Identifier used by the dashboard/protocol
        self.resource_name = "gcp-cloud"
        # Optional monitoring options (not read by this class)
        self.monitor_config = monitor_config

    def monitor(self):
        """Helper to get all utilisation data for resource."""
        return self.get_data()

    def get_data(self):
        """Collect per-machine-type instance counts (run/stop/stage) from GCP."""
        check_cmd = [
            "gcloud",
            "compute",
            "instances",
            "list",
            "--verbosity",
            "error",
        ]
        # Retry until `gcloud` answers without error
        while True:
            try:
                out = sp.check_output(check_cmd)
                break
            except sp.CalledProcessError:
                time.sleep(1)
        # Parse table output: drop header line and trailing empty line.
        # NOTE(review): assumes machine type is column 3 and status the last
        # column of `gcloud compute instances list` — confirm output format.
        rows = {"experiment_type": [], "status": []}
        for raw_line in out.split(b"\n")[1:-1]:
            fields = raw_line.decode("utf-8").split()
            rows["experiment_type"].append(fields[2])
            rows["status"].append(fields[-1])
        df_gcp = pd.DataFrame(rows)
        # Aggregate status counts per machine type
        gcp_data = {"experiment_type": [], "run": [], "stop": [], "stage": []}
        for machine_type in df_gcp.experiment_type.unique():
            sub_df = df_gcp[df_gcp.experiment_type == machine_type]
            gcp_data["experiment_type"].append(machine_type)
            gcp_data["run"].append((sub_df["status"] == "RUNNING").sum())
            gcp_data["stop"].append((sub_df["status"] == "STOPPING").sum())
            gcp_data["stage"].append((sub_df["status"] == "STAGING").sum())
        # Return list of different machine types and their status
        return gcp_data
class LocalResource(object):
    """Utilisation monitor for the local machine.

    Collects process-level, per-device and aggregate CPU/memory statistics
    via `psutil` (and NVIDIA GPU statistics via `GPUtil` when available).
    """

    def __init__(self, monitor_config: Union[dict, None]):
        # Identifier used by the dashboard/protocol
        self.resource_name = "local"
        # Optional monitoring options (not read by this class)
        self.monitor_config = monitor_config

    def monitor(self):
        """Helper to get all utilisation data for local resource.

        Returns:
            Tuple `(proc_data, device_data, util_data)` of dictionaries.
        """
        proc_data = self.get_process_data()
        device_data = self.get_device_data()
        util_data = self.get_util_data()
        return proc_data, device_data, util_data

    def get_process_data(self):
        """Get process info running on local machine."""
        proc_data = {
            "pid": [],
            "p_name": [],
            "mem_util": [],
            "cpu_util": [],
            "cmdline": [],
        }
        # Get the most memory/CPU demanding processes on the local machine
        # (`get_cpu_processes` returns the top_k=6 of each ranking)
        mem_sort, cpu_sort, total_cpu, total_mem = self.get_cpu_processes()
        # CPU-heavy processes first, then memory-heavy ones (may overlap)
        for p in cpu_sort:
            proc_data["pid"].append(p["pid"])
            proc_data["p_name"].append(p["name"])
            proc_data["mem_util"].append(p["vms"])
            proc_data["cpu_util"].append(p["cpu_percent"])
            proc_data["cmdline"].append(p["cmdline"])
        for p in mem_sort:
            proc_data["pid"].append(p["pid"])
            proc_data["p_name"].append(p["name"])
            proc_data["mem_util"].append(p["vms"])
            proc_data["cpu_util"].append(p["cpu_percent"])
            proc_data["cmdline"].append(p["cmdline"])
        proc_data["total_cpu_util"] = total_cpu
        proc_data["total_mem_util"] = total_mem
        return proc_data

    def get_cpu_processes(self, top_k: int = 6):
        """Get list of process sorted by Memory (MB)/CPU Usage (CPU%).

        Returns:
            `(sorted_by_memory[:top_k], sorted_by_cpu[:top_k], total_cpu,
            total_mem)` — note memory-sorted list comes first.
        """
        try:
            import psutil
        except ImportError:
            raise ImportError(
                "You need to install `psutil` to monitor CPU processes."
            )
        listOfProcObjects = []
        # Iterate over all processes; inaccessible ones are skipped below
        for proc in psutil.process_iter():
            try:
                # Fetch process details as dict
                pinfo = proc.as_dict(
                    attrs=["pid", "name", "username", "cmdline"]
                )
                # NOTE(review): divisor mixes binary (1024^2) and decimal
                # (1000) factors — confirm intended unit (approx. GB)
                pinfo["vms"] = proc.memory_info().vms / (1024 * 1024 * 1000)
                pinfo["cpu_percent"] = proc.cpu_percent()
                # Append dict to list
                listOfProcObjects.append(pinfo)
            except (
                psutil.NoSuchProcess,
                psutil.AccessDenied,
                psutil.ZombieProcess,
            ):
                pass
        # Sort list of dict by key vms i.e. memory usage
        sorted_by_memory = sorted(
            listOfProcObjects, key=lambda procObj: procObj["vms"], reverse=True
        )
        sorted_by_cpu = sorted(
            listOfProcObjects,
            key=lambda procObj: procObj["cpu_percent"],
            reverse=True,
        )
        total_cpu = psutil.cpu_count() * psutil.cpu_percent()
        # Used memory in MB
        total_mem = psutil.virtual_memory()._asdict()["used"] / 1000000
        return (
            sorted_by_memory[:top_k],
            sorted_by_cpu[:top_k],
            total_cpu,
            total_mem,
        )

    def get_device_data(self):
        """Get utilization per core."""
        try:
            import psutil
        except ImportError:
            raise ImportError(
                "You need to install `psutil` to monitor CPU processes."
            )
        # NOTE: `cpu_percent(interval=1, ...)` blocks for one second to sample
        # NOTE(review): both "cpu_count" and "logical_count" use logical cores
        # (psutil's default is logical=True) — "cpu_count" likely intended
        # `logical=False` (physical cores); confirm
        device_data = {
            "core_id": np.arange(1, psutil.cpu_count() + 1),
            "percent_util": [
                round(e, 1) for e in psutil.cpu_percent(interval=1, percpu=True)
            ],
            "cpu_count": psutil.cpu_count(logical=True),
            "logical_count": psutil.cpu_count(),
        }
        # Add GPU usage data via GPUtil - best effort: silently skipped when
        # GPUtil is missing or no NVIDIA GPU is present
        try:
            for k, v in self.get_nvidia_gpu_data().items():
                device_data[k] = v
        except Exception:
            pass
        return device_data

    def get_nvidia_gpu_data(self):
        """Helper function to get NVIDIA GPU utilisation."""
        try:
            import GPUtil
        except ImportError:
            raise ImportError(
                "You need to install `gputil` to monitor GPU processes."
            )
        gpus = GPUtil.getGPUs()
        gpu_data = {
            "gpu_id": [],
            "gpu_name": [],
            "gpu_load": [],
            "gpu_mem_util": [],
            "gpu_mem_total": [],
            "gpu_mem_used": [],
        }
        # Loop over detected GPUs and collect per-device stats
        for gp in gpus:
            # (trailing comma below forms a throwaway tuple — harmless no-op)
            gpu_data["gpu_id"].append(gp.id),
            gpu_data["gpu_name"].append(gp.name)
            # Load and memory usage in %
            gpu_data["gpu_load"].append(round(gp.load * 100, 1))
            gpu_data["gpu_mem_util"].append(round(gp.memoryUtil * 100, 1))
            # Memory in GB
            gpu_data["gpu_mem_total"].append(round(gp.memoryTotal / 1000, 1))
            gpu_data["gpu_mem_used"].append(round(gp.memoryUsed / 1000, 1))
        return gpu_data

    def get_util_data(self):
        """Get memory and CPU utilisation for specific local machine."""
        try:
            import psutil
        except ImportError:
            raise ImportError(
                "You need to install `psutil` to monitor CPU processes."
            )
        num_cpus = psutil.cpu_count()
        # Memory totals in GB (decimal)
        total_mem = psutil.virtual_memory()._asdict()["total"] / 1000000000
        used_mem = psutil.virtual_memory()._asdict()["used"] / 1000000000
        util_data = {
            "cores": num_cpus,
            # Number of "core equivalents" currently busy
            "cores_util": num_cpus * psutil.cpu_percent() / 100,
            "mem": total_mem,
            "mem_util": used_mem,
            "time_date": datetime.now().strftime("%m/%d/%y"),
            "time_hour": datetime.now().strftime("%H:%M:%S"),
        }
        return util_data
class SGEResource(object):
    """Utilisation monitor for a Sun Grid Engine (SGE) cluster.

    Shells out to `qconf`/`qstat`/`qhost` and aggregates per-user, per-queue
    and per-node job counts plus cluster-wide core/memory utilisation.
    """

    def __init__(self, monitor_config: Union[dict, None]):
        # Identifier used by the dashboard/protocol
        self.resource_name = "sge-cluster"
        # Expects a dict with a "queues" list of SGE queue names to query
        self.monitor_config = monitor_config

    def monitor(self):
        """Helper to get all utilisation data for resource."""
        user_data, job_df = self.get_user_data()
        queue_data = self.get_queue_data(job_df)
        node_data = self.get_node_data(job_df)
        util_data = self.get_util_data()
        return user_data, queue_data, util_data, node_data

    def get_user_data(self):
        """Get jobs scheduled by SGE cluster users.

        Return dictionary with `users`, `total`, `run`, `wait`, `login`
        (sorted by total job count, decreasing) plus the raw per-job frame.
        """
        user_data = {"user": [], "total": [], "run": [], "wait": [], "login": []}
        all_users = sp.check_output(["qconf", "-suserl"]).split(b"\n")[:-1]
        all_users = [u.decode() for u in all_users]
        # Flatten [["-u", user], ...] into one argument list for qstat
        user_cmd = [["-u", u] for u in all_users]
        user_cmd = [item for sublist in user_cmd for item in sublist]
        queue_cmd = [["-q", q] for q in self.monitor_config["queues"]]
        queue_cmd = [item for sublist in queue_cmd for item in sublist]
        # Check all users and queues in one go (skip 2 qstat header lines)
        all_job_infos = sp.check_output(["qstat", *user_cmd, *queue_cmd]).split(b"\n")[
            2:-1
        ]
        job_df = {
            "user": [],
            "queue": [],
            "status": [],
            "node": [],
        }
        # Clean all jobs (qlogin, qsub, etc.) + get unique users/hosts
        # NOTE(review): assumes qstat column 8 is "queue@node" and column 5
        # the state flag ("r" = running) — confirm against local qstat output
        for job in all_job_infos:
            job_clean = job.decode().split(" ")
            job_clean = list(filter(None, job_clean))
            host_ip = job_clean[7].split("@")
            job_df["user"].append(job_clean[3])
            if len(host_ip) > 1:
                job_df["queue"].append(host_ip[0])
                # [:-1] presumably strips a trailing character — verify
                job_df["node"].append(host_ip[1][:-1])
            else:
                # Pending jobs have no queue@node assignment yet
                job_df["queue"].append(None)
                job_df["node"].append(None)
            if "QLOGIN" in job_clean and job_clean[4] == "r":
                job_df["status"].append("LOGIN")
            elif "QLOGIN" not in job_clean and job_clean[4] == "r":
                job_df["status"].append("R")
            else:
                job_df["status"].append("PD")
        job_df = pd.DataFrame(job_df)
        # Loop over unique users and construct data to show
        unique_users = job_df.user.unique().tolist()
        for u_id in unique_users:
            sub_df = job_df.loc[job_df["user"] == u_id]
            user_data["user"].append(u_id)
            user_data["total"].append(sub_df.shape[0])
            sub_run = sub_df.loc[sub_df["status"] == "R"]
            user_data["run"].append(sub_run.shape[0])
            sub_wait = sub_df.loc[sub_df["status"] == "PD"]
            user_data["wait"].append(sub_wait.shape[0])
            sub_login = sub_df.loc[sub_df["status"] == "LOGIN"]
            user_data["login"].append(sub_login.shape[0])
        # Sort users based on total jobs in decreasing order
        sort_ids = np.argsort(-np.array(user_data["total"]))
        user_data["user"] = [user_data["user"][i] for i in sort_ids]
        user_data["total"] = [user_data["total"][i] for i in sort_ids]
        user_data["run"] = [user_data["run"][i] for i in sort_ids]
        user_data["wait"] = [user_data["wait"][i] for i in sort_ids]
        user_data["login"] = [user_data["login"][i] for i in sort_ids]
        return user_data, job_df

    def get_queue_data(self, job_df: pd.DataFrame):
        """Get jobs running on different SGE queues."""
        host_data = {"host_id": [], "total": [], "run": [], "login": []}
        # Pending jobs carry no queue assignment - exclude them
        job_df = job_df[job_df.status != "PD"]
        unique_queues = job_df.queue.unique().tolist()
        unique_queues.sort(key=natural_keys)
        for h_id in unique_queues:
            sub_df = job_df.loc[job_df["queue"] == h_id]
            host_data["host_id"].append(h_id)
            host_data["total"].append(sub_df.shape[0])
            sub_run = sub_df.loc[sub_df["status"] == "R"]
            host_data["run"].append(sub_run.shape[0])
            sub_login = sub_df.loc[sub_df["status"] == "LOGIN"]
            host_data["login"].append(sub_login.shape[0])
        return host_data

    def get_util_data(self):
        """Get memory and CPU utilisation for specific SGE queue."""
        processes = sp.check_output(["qhost"])
        # Drop the qhost header line and trailing empty line
        all_node_infos = processes.split(b"\n")[1:-1]
        all_node_infos = [j.decode() for j in all_node_infos]
        total_cores, used_cores, total_mem, used_mem = 0, 0, 0, 0
        # NOTE(review): column indices assume a fixed qhost layout (col 3 =
        # NCPU, col 7 = load fraction, [:-1] strips a unit suffix like "G")
        # — confirm against local qhost output
        for n_info in all_node_infos:
            node_clean = n_info.split()
            try:
                # Cores in threads and memory in GB
                total_cores += int(node_clean[2])
                used_cores += float(node_clean[2]) * float(node_clean[6])
                total_mem += float(node_clean[-4][:-1])
                # Total memory - free memory
                used_mem += float(node_clean[-2][:-1])
            except Exception:
                # Header rows / nodes with "-" entries are silently skipped
                pass
        util_data = {
            "cores": total_cores,
            "cores_util": used_cores,
            "mem": total_mem,
            "mem_util": used_mem,
            "time_date": datetime.now().strftime("%m/%d/%y"),
            "time_hour": datetime.now().strftime("%H:%M:%S"),
        }
        return util_data

    def get_node_data(self, job_df: pd.DataFrame):
        """Get jobs running on different SGE cluster nodes."""
        host_data = {"host_id": [], "total": [], "run": [], "login": []}
        # Pending jobs carry no node assignment - exclude them
        job_df = job_df[job_df.status != "PD"]
        unique_nodes = job_df.node.unique().tolist()
        unique_nodes.sort(key=natural_keys)
        for h_id in unique_nodes:
            sub_df = job_df.loc[job_df["node"] == h_id]
            host_data["host_id"].append(h_id)
            host_data["total"].append(sub_df.shape[0])
            sub_run = sub_df.loc[sub_df["status"] == "R"]
            host_data["run"].append(sub_run.shape[0])
            sub_login = sub_df.loc[sub_df["status"] == "LOGIN"]
            host_data["login"].append(sub_login.shape[0])
        return host_data
class SlurmResource(object):
    """Utilisation monitor for a Slurm cluster.

    Shells out to `squeue`/`sinfo` and aggregates per-user, per-partition
    and per-node job counts plus cluster-wide core/memory utilisation.
    """

    def __init__(self, monitor_config: Union[dict, None]):
        # Identifier used by the dashboard/protocol
        self.resource_name = "slurm-cluster"
        # Expects a dict with a "partitions" list of Slurm partition names
        self.monitor_config = monitor_config

    def monitor(self):
        """Helper to get all utilisation data for resource."""
        user_data, job_df = self.get_user_data()
        host_data = self.get_partition_data(job_df)
        node_data = self.get_node_data(job_df)
        util_data = self.get_util_data()
        return user_data, host_data, util_data, node_data

    def get_user_data(self):
        """Get jobs scheduled by Slurm cluster users.

        Returns dictionary with per-user job counts (sorted decreasing by
        total) plus the raw per-job dataframe.
        """
        user_data = {"user": [], "total": [], "run": [], "wait": [], "login": []}
        # Get squeue output in detailed form
        processes = sp.check_output(
            [
                "squeue",
                "-o",
                '"%.20P %.20u %.2t %.10M %.6D %C %m %N"',
                "-p",
                (",").join(self.monitor_config["partitions"]),
            ]
        )
        # Drop squeue header line and trailing empty line
        all_job_infos = processes.split(b"\n")[1:-1]
        all_job_infos = [j.decode() for j in all_job_infos]
        job_df = {
            "user": [],
            "partition": [],
            "status": [],
            "node": [],
            "run_time": [],
            "num_cores": [],
            "min_memory": [],
        }
        # Loop over jobs and extract relevant data into dataframe
        # NOTE(review): `[1:]` drops the leading quote token and `[:-1]` the
        # trailing quote from the quoted format string — confirm field order
        # matches '%.20P %.20u %.2t %.10M %.6D %C %m %N'
        for job in all_job_infos:
            job_clean = job.split()[1:]
            job_df["user"].append(job_clean[1])
            job_df["partition"].append(job_clean[0])
            job_df["status"].append(job_clean[2])
            job_df["run_time"].append(job_clean[3])
            job_df["node"].append(job_clean[7][:-1])
            job_df["num_cores"].append(job_clean[5])
            job_df["min_memory"].append(job_clean[6])
        job_df = pd.DataFrame(job_df)
        # Loop over unique users and construct data to show
        unique_users = job_df.user.unique().tolist()
        for u_id in unique_users:
            sub_df = job_df.loc[job_df["user"] == u_id]
            user_data["user"].append(u_id)
            user_data["total"].append(sub_df.shape[0])
            sub_run = sub_df.loc[sub_df["status"] == "R"]
            user_data["run"].append(sub_run.shape[0])
            sub_wait = sub_df.loc[sub_df["status"] == "PD"]
            # "CG" is completing status
            user_data["wait"].append(sub_wait.shape[0])
            # Slurm has no qlogin-style sessions - login count is always 0
            user_data["login"].append(0)
        # Sort users based on total jobs in decreasing order
        sort_ids = np.argsort(-np.array(user_data["total"]))
        user_data["user"] = [user_data["user"][i] for i in sort_ids]
        user_data["total"] = [user_data["total"][i] for i in sort_ids]
        user_data["run"] = [user_data["run"][i] for i in sort_ids]
        user_data["wait"] = [user_data["wait"][i] for i in sort_ids]
        user_data["login"] = [user_data["login"][i] for i in sort_ids]
        return user_data, job_df

    def get_partition_data(self, job_df: pd.DataFrame):
        """Get jobs running on different Slurm cluster partitions."""
        host_data = {"host_id": [], "total": [], "run": [], "login": []}
        unique_partitions = job_df.partition.unique().tolist()
        unique_partitions.sort(key=natural_keys)
        for h_id in unique_partitions:
            sub_df = job_df.loc[job_df["partition"] == h_id]
            host_data["host_id"].append(h_id)
            host_data["total"].append(sub_df.shape[0])
            sub_run = sub_df.loc[sub_df["status"] == "R"]
            host_data["run"].append(sub_run.shape[0])
            host_data["login"].append(0)
        return host_data

    def get_util_data(self):
        """Get memory and CPU utilisation for specific slurm partition."""
        # Get per-node info from sinfo (one row per node)
        processes = sp.check_output(["sinfo", "--Node", "-o", '"%.20P %N %c %O %m %e"'])
        all_node_infos = processes.split(b"\n")[1:-1]
        all_node_infos = [j.decode() for j in all_node_infos]
        total_cores, used_cores, total_mem, used_mem = 0, 0, 0, 0
        # NOTE(review): column indices assume the '%.20P %N %c %O %m %e'
        # format (cores, load, memory, free memory) — confirm locally
        for n_info in all_node_infos:
            node_clean = n_info.split()[1:]
            try:
                # Cores in threads and memory in GB
                total_cores += int(node_clean[2])
                used_cores += float(node_clean[2]) * float(node_clean[3]) / 100
                total_mem += float(node_clean[4]) / 1000
                # Total memory - free memory
                used_mem += (
                    float(node_clean[4]) / 1000 - float(node_clean[5][:-1]) / 1000
                )
            except Exception:
                # Nodes with non-numeric entries are silently skipped
                pass
        util_data = {
            "cores": total_cores,
            "cores_util": used_cores,
            "mem": total_mem,
            "mem_util": used_mem,
            "time_date": datetime.now().strftime("%m/%d/%y"),
            "time_hour": datetime.now().strftime("%H:%M:%S"),
        }
        return util_data

    def get_node_data(self, job_df: pd.DataFrame):
        """Get jobs running on different Slurm cluster nodes."""
        host_data = {"host_id": [], "total": [], "run": [], "login": []}
        unique_nodes = job_df.node.unique().tolist()
        unique_nodes.sort(key=natural_keys)
        for h_id in unique_nodes:
            sub_df = job_df.loc[job_df["node"] == h_id]
            host_data["host_id"].append(h_id)
            host_data["total"].append(sub_df.shape[0])
            sub_run = sub_df.loc[sub_df["status"] == "R"]
            host_data["run"].append(sub_run.shape[0])
            host_data["login"].append(0)
        return host_data
def send_dir_gcp(
    cloud_settings: dict,
    local_dir: Union[str, None] = None,
    number_of_connect_tries: int = 5,
):
    """Send entire dir (recursively) to Google Cloud Storage Bucket.

    Args:
        cloud_settings: Dict with `project_name`, `bucket_name`, `remote_dir`.
        local_dir: Directory to upload; defaults to the current working dir.
        number_of_connect_tries: Connection attempts before giving up.

    Raises:
        ImportError: If `google-cloud-storage` is not installed.
        RuntimeError: If the bucket could not be reached after all retries.
    """
    try:
        from google.cloud import storage
    except ImportError:
        raise ImportError(
            "You need to install `google-cloud-storage` to use GCP buckets."
        )

    logger = setup_logger()
    bucket = None
    for i in range(number_of_connect_tries):
        try:
            client = storage.Client(cloud_settings["project_name"])
            bucket = client.get_bucket(
                cloud_settings["bucket_name"], timeout=20
            )
            # BUGFIX: stop retrying once the bucket was obtained; previously
            # the loop reconnected on every iteration even after success.
            break
        except Exception:
            logger.info(
                f"Attempt {i+1}/{number_of_connect_tries}"
                " - Failed sending to GCloud Storage"
            )
    if bucket is None:
        # BUGFIX: previously fell through to a NameError on `bucket` below.
        raise RuntimeError("Could not connect to GCloud Storage bucket.")

    def upload_single_file(local_path, gcs_path, bucket):
        # Recursively upload the folder structure
        if os.path.isdir(local_path):
            for local_file in glob.glob(os.path.join(local_path, "**")):
                if not os.path.isfile(local_file):
                    upload_single_file(
                        local_file,
                        os.path.join(gcs_path, os.path.basename(local_file)),
                        bucket,
                    )
                else:
                    remote_path = os.path.join(
                        gcs_path, local_file[1 + len(local_path) :]
                    )
                    blob = bucket.blob(remote_path)
                    blob.upload_from_filename(local_file)
        # Only upload single file - e.g. zip compressed experiment
        else:
            remote_path = gcs_path
            blob = bucket.blob(remote_path)
            blob.upload_from_filename(local_path)

    if local_dir is None:
        local_dir = os.getcwd()
    upload_single_file(local_dir, cloud_settings["remote_dir"], bucket)
def copy_dir_gcp(
    cloud_settings: dict,
    remote_dir: str,
    local_dir: Union[str, None] = None,
    number_of_connect_tries: int = 5,
):
    """Download entire dir (recursively) from Google Cloud Storage Bucket.

    Args:
        cloud_settings: Dict with `project_name` and `bucket_name`.
        remote_dir: Prefix of the blobs to download.
        local_dir: Target directory; defaults to the last remote path part.
        number_of_connect_tries: Connection attempts before giving up.

    Raises:
        ImportError: If `google-cloud-storage` is not installed.
        RuntimeError: If the bucket could not be reached after all retries.
    """
    try:
        from google.cloud import storage
    except ImportError:
        raise ImportError(
            "You need to install `google-cloud-storage` to use GCP buckets."
        )

    logger = setup_logger()
    bucket = None
    for i in range(number_of_connect_tries):
        try:
            client = storage.Client(cloud_settings["project_name"])
            bucket = client.get_bucket(
                cloud_settings["bucket_name"], timeout=20
            )
            # BUGFIX: stop retrying once the bucket was obtained; previously
            # the loop reconnected on every iteration even after success.
            break
        except Exception:
            logger.info(
                f"Attempt {i+1}/{number_of_connect_tries}"
                " - Failed sending to GCloud Storage"
            )
    if bucket is None:
        # BUGFIX: previously fell through to a NameError on `bucket` below.
        raise RuntimeError("Could not connect to GCloud Storage bucket.")

    blobs = bucket.list_blobs(prefix=remote_dir)  # Get list of files
    blobs = list(blobs)

    if local_dir is None:
        path = os.path.normpath(remote_dir)
        local_dir = path.split(os.sep)[-1]

    # Recursively download the folder structure
    if len(blobs) > 1:
        local_path = os.path.expanduser(local_dir)
        for blob in blobs:
            filename = blob.name[len(remote_dir) :]
            local_f_path = os.path.join(local_path, filename)
            dir_path = os.path.dirname(local_f_path)
            if not os.path.exists(dir_path):
                try:
                    os.makedirs(dir_path)
                except Exception:
                    # Directory may have been created concurrently
                    pass
            temp_file = local_path + filename
            temp_dir = os.path.split(temp_file)[0]
            if not os.path.exists(temp_dir):
                os.makedirs(temp_dir)
            blob.download_to_filename(temp_file)
    # Only download single file - e.g. zip compressed experiment
    else:
        for blob in blobs:
            filename = blob.name.split("/")[1]
            blob.download_to_filename(filename)
def zipdir(path: str, zip_fname: str):
    """Zip a directory (recursively) to upload afterwards to GCloud Storage.

    Args:
        path: Directory whose contents are compressed.
        zip_fname: Filename of the `.zip` archive to create.
    """
    # Context manager guarantees the archive is closed even if a write fails
    # (the previous implementation leaked the handle on exceptions).
    with zipfile.ZipFile(zip_fname, "w", zipfile.ZIP_DEFLATED) as ziph:
        for root, _dirs, files in os.walk(path):
            for file in files:
                abs_path = os.path.join(root, file)
                # Store paths relative to `path` inside the archive. relpath
                # also handles a trailing separator in `path` correctly,
                # which the old `root[len(path) + 1:]` slicing broke.
                ziph.write(abs_path, os.path.relpath(abs_path, path))
def natural_keys(text):
    """Sort key for human ('natural') ordering, e.g. host2 < host10.

    Splits *text* into alternating non-digit/digit chunks and converts the
    digit chunks to ints, so numeric parts compare numerically:
    `alist.sort(key=natural_keys)`.
    """
    return [
        int(chunk) if chunk.isdigit() else chunk
        for chunk in re.split(r"(\d+)", text)
    ]
class Tracker(object):
    """MLE Tracker for Resource Utilization & Running Experiments.

    Persists a rolling time series of relative CPU/memory utilisation to a
    hidden `.npy` file in the user's home directory.
    """

    def __init__(self, fname=".mle_tracker.npy"):
        """MLE Tracker for Resource Utilization & Running Experiments."""
        self.fname = os.path.join(os.path.expanduser("~"), fname)
        # Storage limit & reload previous stored data
        self.limit = 100000
        self.load()

    def update(self, util_data: dict, save: bool = True):
        """Append one utilisation sample and optionally persist the history.

        Args:
            util_data: Dict with `time_date`, `time_hour`, `mem_util`, `mem`,
                `cores_util` and `cores` entries (as produced by resources).
            save: Whether to write the updated history to disk.

        Returns:
            Dict with the full (truncated) relative utilisation time series.
        """
        self.times_date.append(util_data["time_date"])
        self.times_hour.append(util_data["time_hour"])
        # Store *relative* utilisation (used / total)
        self.mem_util.append(util_data["mem_util"] / util_data["mem"])
        self.cpu_util.append(util_data["cores_util"] / util_data["cores"])
        self.moving_window()
        # BUGFIX: the `save` flag was previously ignored - every update
        # unconditionally hit the disk.
        if save:
            self.save()
        return {
            "times_date": self.times_date,
            "times_hour": self.times_hour,
            "rel_mem_util": self.mem_util,
            "rel_cpu_util": self.cpu_util,
        }

    def moving_window(self):
        """Truncate/Limit memory to approx. last 27 hours."""
        # BUGFIX: keep the *most recent* `limit` samples. The previous slice
        # `[: self.limit]` kept the oldest entries and silently discarded all
        # new data once the buffer was full - the opposite of a moving window.
        self.times_date = self.times_date[-self.limit :]
        self.times_hour = self.times_hour[-self.limit :]
        self.mem_util = self.mem_util[-self.limit :]
        self.cpu_util = self.cpu_util[-self.limit :]

    def load(self):
        """Creates a hidden file storing usage time series data."""
        try:
            data = np.load(self.fname)
            self.mem_util = data[:, 0].astype(float).tolist()
            self.cpu_util = data[:, 1].astype(float).tolist()
            self.times_date = data[:, 2].tolist()
            self.times_hour = data[:, 3].tolist()
        except Exception:
            # Missing/corrupt file: start storing utilisation history afresh
            self.mem_util = []
            self.cpu_util = []
            self.times_date = []
            self.times_hour = []

    def save(self):
        """Save recent usage time series data as a (T, 4) string array."""
        stacked = np.stack(
            [self.mem_util, self.cpu_util, self.times_date, self.times_hour], axis=1
        )
        np.save(self.fname, stacked)
def parse_requirements(path: str) -> List[str]:
    """Return the non-blank, non-comment lines of a requirements file."""
    requirements = []
    with open(os.path.join(CURRENT_DIR, path)) as req_file:
        for raw_line in req_file:
            # Skip blank lines and full-line comments
            if raw_line.isspace() or raw_line.startswith("#"):
                continue
            requirements.append(raw_line.rstrip())
    return requirements
# Download URL of the tagged source tarball for this release on GitHub
git_tar = f"https://github.com/mle-infrastructure/mle-monitor/archive/v{verstr}.tar.gz"

setup(
    name="mle_monitor",
    version=verstr,
    author="Robert Tjarko Lange",
    author_email="robertlange0@gmail.com",
    description="Machine Learning Experiment Resource Monitoring",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/mle-infrastructure/mle-monitor",
    download_url=git_tar,
    classifiers=[
        "Programming Language :: Python :: 3.6",
        "Programming Language :: Python :: 3.7",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
    ],
    packages=find_packages(),
    include_package_data=True,
    zip_safe=False,
    platforms="any",
    python_requires=">=3.6",
    # Core runtime dependencies read from requirements/requirements.txt
    install_requires=parse_requirements(
        os.path.join(CURRENT_DIR, "requirements", "requirements.txt")
    ),
    tests_require=parse_requirements(
        os.path.join(CURRENT_DIR, "requirements", "requirements-test.txt")
    ),
    # Optional extras: `pip install mle_monitor[examples]` / `[full]`
    extras_require={
        "examples": parse_requirements(
            os.path.join(
                CURRENT_DIR, "requirements", "requirements-examples.txt"
            )
        ),
        "full": ["psutil", "gputil", "google-cloud-storage", "GitPython"],
    },
)
def test_add_protocol():
    """Adding an experiment to a fresh protocol stores all meta data."""
    protocol = MLEProtocol(protocol_fname="mle_protocol.db")
    experiment_id = protocol.add(meta_data, save=False)
    stored = protocol.get(experiment_id)
    for key, value in meta_data.items():
        assert stored[key] == value
    return
def test_resource():
    """Local resource monitoring returns all expected data fields."""
    # Instantiate local resource and get usage data
    resource = MLEResource(resource_name="local")
    resource_data = resource.monitor()
    for field in ("resource_name", "user_data", "host_data", "util_data"):
        assert field in resource_data
    return