Repository: cnstark/gputasker Branch: main Commit: 36f4a7232af1 Files: 45 Total size: 52.0 KB Directory structure: gitextract_e6ayn4we/ ├── .github/ │ └── workflows/ │ └── docker-build.yml ├── .gitignore ├── Dockerfile ├── LICENSE ├── README.md ├── base/ │ ├── __init__.py │ ├── admin.py │ ├── apps.py │ ├── migrations/ │ │ └── __init__.py │ ├── models.py │ ├── tests.py │ ├── utils.py │ └── views.py ├── docker-compose.yml ├── entrypoint.sh ├── gpu_info/ │ ├── __init__.py │ ├── admin.py │ ├── apps.py │ ├── migrations/ │ │ └── __init__.py │ ├── models.py │ ├── tests.py │ ├── utils.py │ └── views.py ├── gpu_tasker/ │ ├── .gitignore │ ├── __init__.py │ ├── asgi.py │ ├── email_settings_sample.py │ ├── settings.py │ ├── urls.py │ └── wsgi.py ├── main.py ├── manage.py ├── nginx/ │ └── conf.d/ │ └── gpu_tasker.conf ├── notification/ │ ├── __init__.py │ └── email_notification.py ├── static/ │ └── css/ │ └── admin/ │ └── custom.css ├── task/ │ ├── __init__.py │ ├── admin.py │ ├── apps.py │ ├── migrations/ │ │ └── __init__.py │ ├── models.py │ ├── tests.py │ ├── utils.py │ └── views.py └── uwsgi/ └── uwsgi.ini ================================================ FILE CONTENTS ================================================ ================================================ FILE: .github/workflows/docker-build.yml ================================================ name: Build Docker Image CI on: push: branches: - "main" jobs: build: runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 - name: Login DockerHub run: docker login --username=${{ secrets.DOCKER_USERNAME }} --password=${{ secrets.DOCKER_PASSWORD }} - name: Build docker image run: docker build -t ${{ secrets.DOCKER_USERNAME }}/gputasker:latest . 
- name: Push docker image run: docker push ${{ secrets.DOCKER_USERNAME }}/gputasker:latest ================================================ FILE: .gitignore ================================================ __pycache__ .vscode running_log private_key server_log */migrations/* !*/migrations/__init__.py *.sqlite3* .idea ================================================ FILE: Dockerfile ================================================ FROM python:3.9 RUN apt update && \ apt install -y openssh-client && \ apt clean && \ rm -rf /var/lib/apt/lists/* RUN pip install django django-simpleui mysqlclient uwsgi && \ rm -r /root/.cache/pip ENV DOCKER_DEPLOY 1 ADD https://raw.githubusercontent.com/vishnubob/wait-for-it/master/wait-for-it.sh / ADD . /gpu_tasker WORKDIR /gpu_tasker RUN chmod +x /wait-for-it.sh && \ chmod +x entrypoint.sh VOLUME /gpu_tasker/server_log VOLUME /gpu_tasker/running_log VOLUME /gpu_tasker/private_key VOLUME /gpu_tasker/static_collected VOLUME /gpu_tasker/uwsgi/log EXPOSE 9009 ENTRYPOINT ["/wait-for-it.sh", "mariadb:3306", "-t", "180", "--", "./entrypoint.sh"] ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2020 Tony Stark Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================ # GPU Tasker 轻量好用的GPU机群任务调度工具 [![simpleui](https://img.shields.io/badge/developing%20with-Simpleui-2077ff.svg)](https://github.com/newpanjing/simpleui) [![docker build](https://github.com/cnstark/gputasker/actions/workflows/docker-build.yml/badge.svg)](https://hub.docker.com/r/cnstark/gputasker) ## 介绍 GPU Tasker是一款GPU任务调度工具,适用于GPU机群或单机环境,科学地调度每一项任务,深度学习工作者的福音。 **警告:不建议将本工具用于实验室抢占GPU,这将会使你的同学或师兄盯上你(狗头)** ## 开始使用 ### 环境准备 在机群环境下,将GPU Tasker安装在机群环境下的一台服务器或PC,安装GPU Tasker的服务器成为Master,其余服务器称为Node,Master可以通过ssh连接所有Node服务器。**建议Node服务器连接NAS或拥有共享目录,并连接LDAP。** 安装django、django-simpleui ```shell pip install django django-simpleui ``` ### 部署GPU Tasker GPU Tasker支持手动部署与Docker部署。 #### 手动部署 * 在Master服务器clone本项目 ```shell git clone https://github.com/cnstark/gputasker.git cd gputasker ``` * 编辑`gpu_tasker/settings.py`,编辑数据库等django基本设置,如果是单用户使用或机群规模较小时(或者服务器安装MySQL困难),使用sqlite即可。 * 初始化项目数据库 ```shell python manage.py makemigrations python manage.py migrate ``` * 创建超级用户 ```shell python manage.py createsuperuser ``` 根据提示输入信息,完成创建。 * 启动服务 ```shell python manage.py runserver --insecure 0.0.0.0:8888 ``` * 启动主进程 ```shell python main.py ``` #### Docker部署 * 安装[Docker](https://docs.docker.com/get-docker/)与[docker-compose](https://docs.docker.com/compose/install/) * 在Master服务器clone本项目 ```shell git clone https://github.com/cnstark/gputasker.git cd gputasker ``` * 启动GPUTasker ```shell sudo docker-compose up -d ``` * 创建超级用户 
注意:初次使用时需要等待初始化完成后才能创建超级用户,等待时间约30秒。当`http://your_server:8888/admin`可以正常访问后再执行: ```shell sudo docker exec -it gputasker_django python manage.py createsuperuser ``` 根据提示输入信息,完成创建。 ### 基本设置 访问`http://your_server:8888/admin`,登录管理后台。 ![home](.assets/home.png) 添加`用户设置`,输入服务器用户名与私钥。Master通过私钥登录Node服务器,需要将私钥添加至Node服务器`authorized_keys`。 暂只支持每个服务器使用相同的用户名,后续版本迭代可能会支持。 ![home](.assets/user_config.png) ### 添加Node节点 点击`GPU服务器`,添加Node节点ip或域名,点击保存。保存后会自动更新node节点信息,包括hostname以及GPU信息 ![home](.assets/add_server.png) 选项说明 * 是否可用:服务器当前状态是否可用。若连接失败或无法获取GPU状态则会被自动置为False并不再被调度。 * 是否可调度:服务器是否参与任务调度。若服务器有其他用途(被人独占等),手动设置此项为False,该服务器不再被调度。 ### 添加任务 点击`GPU任务`,输入任务信息并保存。状态为`准备就绪`的任务会在服务器满足需求时执行。 ![home](.assets/add_task.png) 选项说明 * 工作目录:执行命令时所在的工作目录。 * 命令:执行的命令。支持多行命令,如: ```shell source venv/pytorch/bin/activate python train.py ``` 注意:使用conda环境时,由于ssh远程执行无法获取conda环境变量导致`conda activate`失败,需要先激活conda再激活虚拟环境。或者使用`python`绝对路径。例如: ```shell source /path/to/anaconda3/bin/activate conda activate pytorch python train.py # 或 /path/to/anaconda3/envs/pytorch/bin/python train.py ``` * GPU数量需求:任务所需的GPU数量。当任务被调度时,会根据所需GPU数量自动设置`CUDA_VISIBLE_DEVICES`环境变量,因此任务命令中不要手动设置`CUDA_VISIBLE_DEVICES`,避免调度失败。 * 独占显卡:当该选项为True时,只会调度没有进程占用的显卡。 * 显存需求:任务在单GPU上需要的显存。设置时保证任务可以运行即可,不需要准确。 * 利用率需求:任务在单GPU上需要的空闲利用率。 注意:显存需求和利用率需求只在`独占显卡`为False时生效,当GPU满足显存需求和利用率时会参与调度。仅用于GPU全被占满需要强占的情况,一般情况下建议勾选`独占显卡`。 * 指定服务器:选择任务运行的服务器。若该选项为空,则在所有可调度服务器中寻找满足需求的服务器;否则只在指定服务器上等待GPU满足条件时调度。 * 优先级:任务调度的优先级。功能尚未支持。 * 状态:当前任务状态。状态为`准备就绪`时,任务会被调度。 任务运行后可以通过`GPU任务运行记录`查看任务状态与Log。 ## 通知设置 GPUTasker支持邮件通知,任务开始运行和结束时向用户发送邮件提醒。 ### 开启邮箱SMTP功能 进入邮箱后台,开启SMTP功能,并获取SMTP密钥。不同邮件服务商配置方式不同,具体开启方法参考邮箱帮助。 ### 配置邮件通知 复制`email_settings_sample.py`为`email_settings.py`。 ```shell cd gpu_tasker cp email_settings_sample.py email_settings.py ``` 编辑`email_settings.py`,填写SMTP服务器、端口、邮箱名和密码: ```python # 以163邮箱为例 EMAIL_BACKEND = 'django.core.mail.backends.smtp.EmailBackend' # SMTP服务器 EMAIL_HOST = 'smtp.163.com' # SMTP服务器端口 EMAIL_PORT = 465 # 邮箱名 EMAIL_HOST_USER = 
@admin.register(UserConfig)
class UserConfigAdmin(admin.ModelAdmin):
    """Admin page where a user stores the SSH account used on the GPU nodes.

    Saving a record also writes the private key to a file below
    ``PRIVATE_KEY_DIR`` and records that file's path on the model.
    """

    list_display = ('user', 'server_username',)
    # ``user`` is a relation, so the search has to name a text column on the
    # related model; searching on the bare relation makes the admin fail.
    search_fields = ('user__username', 'server_username',)
    list_display_links = ('user',)
    readonly_fields = ('user', 'server_private_key_path',)
    ordering = ('user',)

    def get_queryset(self, request):
        """Return every config for superusers, otherwise only the caller's own."""
        qs = super().get_queryset(request)
        if request.user.is_superuser:
            return qs
        return qs.filter(user=request.user)

    def has_add_permission(self, request):
        """Allow every admin-site user to create a config."""
        return True

    def save_model(self, request, obj, form, change):
        """Persist the config and write its private key to disk.

        :param request: current admin request; its user owns a new config.
        :param obj: the ``UserConfig`` instance being saved.
        :param form: the bound admin form (passed through unchanged).
        :param change: ``False`` when the object is being created.
        """
        if not change:
            obj.user = request.user

        # Keep only the final path component so a user name that contains
        # path separators cannot place the key file outside PRIVATE_KEY_DIR.
        key_file_name = os.path.basename(obj.server_username) + '_pk'
        obj.server_private_key_path = os.path.join(PRIVATE_KEY_DIR, key_file_name)

        # Normalise line endings and guarantee a trailing newline.
        # ``endswith`` is also safe for an empty string, unlike ``[-1]``.
        private_key = obj.server_private_key.replace('\r\n', '\n')
        if not private_key.endswith('\n'):
            private_key = private_key + '\n'
        obj.server_private_key = private_key

        # Create the file with owner-only permissions from the start, so the
        # key is never readable by other users between write and chmod.
        owner_only = stat.S_IWUSR | stat.S_IREAD
        fd = os.open(
            obj.server_private_key_path,
            os.O_WRONLY | os.O_CREAT | os.O_TRUNC,
            owner_only,
        )
        with os.fdopen(fd, 'w') as f:
            f.write(private_key)
        # The mode passed to os.open only applies to newly created files;
        # tighten a pre-existing file as well.
        os.chmod(obj.server_private_key_path, owner_only)

        super().save_model(request, obj, form, change)
def get_admin_config():
    """Return the SSH credentials configured for the first superuser.

    :return: tuple ``(server_username, server_private_key_path)``.
    :raises RuntimeError: if no superuser exists, or if that superuser has
        no config record yet.
    """
    # ``first()`` gives a deterministic pick (lowest primary key when the
    # queryset is unordered) and avoids indexing the queryset repeatedly.
    admin_user = User.objects.filter(is_superuser=True).first()
    if admin_user is None:
        raise RuntimeError('Please create a superuser!')

    # ``config`` is the reverse side of a one-to-one relation. When the row
    # is missing, attribute access raises instead of returning None, so the
    # lookup needs a default to reach the friendly error below.
    config = getattr(admin_user, 'config', None)
    if config is None:
        raise RuntimeError(
            'Please login admin site and create a config for user {}!'.format(admin_user.username)
        )
    return config.server_username, config.server_private_key_path
def _format_memory_usage(gpu):
    """Render ``used / total MB (percent%)`` for one GPU row."""
    memory_total = gpu.memory_total
    memory_used = gpu.memory_used
    # A row whose total is 0 would otherwise raise ZeroDivisionError and
    # break rendering of the whole admin page.
    percent = memory_used / memory_total * 100 if memory_total else 0
    return '{:d} / {:d} MB ({:.0f}%)'.format(memory_used, memory_total, percent)


class GPUInfoInline(admin.TabularInline):
    """Read-only table of a server's GPUs, shown on the server admin page."""

    model = GPUInfo
    fields = ('index', 'name', 'utilization', 'memory_usage', 'usernames', 'complete_free', 'update_at')
    readonly_fields = ('index', 'name', 'utilization', 'memory_usage', 'usernames', 'complete_free', 'update_at')
    show_change_link = True

    def usernames(self, obj):
        """Return the summary of users currently occupying the GPU."""
        return obj.usernames()

    def memory_usage(self, obj):
        """Return the formatted memory usage column."""
        return _format_memory_usage(obj)

    memory_usage.short_description = '显存占用率'
    usernames.short_description = '使用者'

    def get_extra(self, request, obj, **kwargs):
        """Never show empty extra rows."""
        return 0

    # The inline is display-only: rows cannot be added, edited or deleted.
    # ``obj`` defaults to None so the hooks also work when only the request
    # is passed.
    def has_add_permission(self, request, obj=None):
        return False

    def has_change_permission(self, request, obj=None):
        return False

    def has_delete_permission(self, request, obj=None):
        return False


@admin.register(GPUServer)
class GPUServerAdmin(admin.ModelAdmin):
    """Admin page for GPU servers, with their GPUs listed inline."""

    list_display = ('ip', 'hostname', 'port', 'valid', 'can_use')
    list_editable = ('can_use',)
    search_fields = ('ip', 'hostname', 'port', 'valid', 'can_use')
    list_display_links = ('ip',)
    inlines = (GPUInfoInline,)
    ordering = ('ip',)
    readonly_fields = ('hostname',)

    class Media:
        # custom css
        css = {
            'all': ('css/admin/custom.css', )
        }

    def has_add_permission(self, request):
        """Only superusers may register new servers."""
        return request.user.is_superuser


@admin.register(GPUInfo)
class GPUInfoAdmin(admin.ModelAdmin):
    """Read-only admin page listing every known GPU."""

    list_display = ('index', 'name', 'server', 'utilization', 'memory_usage', 'usernames', 'complete_free', 'update_at')
    list_filter = ('server', 'name', 'complete_free')
    # ``server`` is a foreign key; search on concrete columns of the related
    # server instead of the relation itself, which makes the search fail.
    search_fields = ('uuid', 'name', 'memory_used', 'server__ip', 'server__hostname',)
    list_display_links = ('name',)
    ordering = ('server', 'index')
    readonly_fields = ('uuid', 'name', 'index', 'utilization', 'memory_total', 'memory_used', 'server', 'processes', 'use_by_self', 'complete_free', 'update_at')

    def usernames(self, obj):
        """Return the summary of users currently occupying the GPU."""
        return obj.usernames()

    def has_add_permission(self, request):
        """GPU rows cannot be added by hand."""
        return False

    def memory_usage(self, obj):
        """Return the formatted memory usage column."""
        return _format_memory_usage(obj)

    memory_usage.short_description = '显存占用率'
    usernames.short_description = '使用者'
self.valid and self.can_use: for gpu in self.gpus.all(): if gpu.check_available(exclusive, memory, utilization): available_gpu_list.append(gpu.index) if len(available_gpu_list) >= gpu_num: return available_gpu_list else: return None else: return None def set_gpus_busy(self, gpu_list): self.gpus.filter(index__in=gpu_list).update(use_by_self=True) def set_gpus_free(self, gpu_list): self.gpus.filter(index__in=gpu_list).update(use_by_self=False) class GPUInfo(models.Model): uuid = models.CharField('UUID', max_length=40, primary_key=True) index = models.PositiveSmallIntegerField('序号') name = models.CharField('名称', max_length=40) utilization = models.PositiveSmallIntegerField('利用率') memory_total = models.PositiveIntegerField('总显存') memory_used = models.PositiveIntegerField('已用显存') processes = models.TextField('进程') server = models.ForeignKey(GPUServer, verbose_name='服务器', on_delete=models.CASCADE, related_name='gpus') use_by_self = models.BooleanField('是否被gputasker进程占用', default=False) complete_free = models.BooleanField('完全空闲', default=False) update_at = models.DateTimeField('更新时间', auto_now=True) class Meta: ordering = ('server', 'index',) verbose_name = 'GPU信息' verbose_name_plural = 'GPU信息' def __str__(self): return self.name + '[' + str(self.index) + '-' + self.server.ip + ']' @property def memory_available(self): return self.memory_total - self.memory_used @property def utilization_available(self): return 100 - self.utilization def check_available(self, exclusive, memory, utilization): if exclusive: return not self.use_by_self and self.complete_free else: return not self.use_by_self and self.memory_available > memory and self.utilization_available > utilization def usernames(self): r""" convert processes string to usernames string array. :return: string array of usernames. 
def ssh_execute(host, user, exec_cmd, port=22, private_key_path=None):
    """Run ``exec_cmd`` on ``host`` over ssh and return its raw stdout.

    The command is handed to ``ssh`` as a single argument without going
    through a local shell, so characters such as ``$``, quotes, backticks
    and backslashes reach the remote side unchanged.

    :param host: address of the remote machine.
    :param user: login name on the remote machine.
    :param exec_cmd: command text; may span several lines.
    :param port: ssh port, must be an integer.
    :param private_key_path: identity file passed with ``-i``; omitted when
        ``None``.
    :return: bytes written to stdout by the remote command.
    :raises ValueError: if ``exec_cmd`` is empty or ``port`` is not an int.
    :raises subprocess.CalledProcessError: on a non-zero exit status.
    :raises subprocess.TimeoutExpired: if the call takes longer than 60s.
    """
    if not exec_cmd:
        raise ValueError('exec_cmd must not be empty')

    exec_cmd = exec_cmd.replace('\r\n', '\n')
    if not exec_cmd.endswith('\n'):
        exec_cmd = exec_cmd + '\n'

    # '{:d}' keeps the strict integer check on the port.
    cmd = ['ssh', '-o', 'StrictHostKeyChecking=no', '-p', '{:d}'.format(port)]
    if private_key_path is not None:
        cmd.extend(['-i', str(private_key_path)])
    cmd.extend(['{}@{}'.format(user, host), exec_cmd])

    # An argument list with no local shell: nothing in exec_cmd can be
    # expanded or executed on the machine running this function.
    return subprocess.check_output(cmd, timeout=60)


def get_hostname(host, user, port=22, private_key_path=None):
    """Return the output of ``hostname`` on the remote machine as text."""
    output = ssh_execute(host, user, 'hostname', port, private_key_path)
    # Decode the bytes properly instead of string-mangling their repr.
    return output.decode('utf-8', errors='replace').strip()


def add_hostname(server, user, private_key_path=None):
    """Look up the host name of ``server`` over ssh and save it on the row."""
    server.hostname = get_hostname(server.ip, user, server.port, private_key_path)
    server.save()
def get_gpu_status(host, user, port=22, private_key_path=None):
    """Query a remote machine for its GPUs and the processes using them.

    Runs ``nvidia-smi`` (and ``ps`` for process owners) through
    ``ssh_execute`` and parses the CSV output. Lines that cannot be parsed
    are skipped.

    :return: list of dicts with the keys ``index``, ``uuid``, ``name``,
        ``utilization.gpu``, ``memory.total``, ``memory.used`` and
        ``processes``; each process is a dict with ``pid``, ``command``,
        ``gpu_memory_usage`` and ``username``.
    :raises RuntimeError: if the nvidia-smi output contains ``Error``.
    """
    query_gpu_cmd = 'nvidia-smi --query-gpu=uuid,gpu_name,utilization.gpu,memory.total,memory.used --format=csv | grep -v \'uuid\''
    gpu_info_raw = ssh_execute(host, user, query_gpu_cmd, port, private_key_path).decode('utf-8')
    if 'Error' in gpu_info_raw:
        raise RuntimeError(gpu_info_raw)

    gpu_info_list = []
    gpu_info_dict = {}
    for index, gpu_info_line in enumerate(gpu_info_raw.split('\n')):
        items = [item.strip() for item in gpu_info_line.split(',')]
        try:
            gpu_info = {
                'index': index,
                'uuid': items[0],
                'name': items[1],
                # Values look like "35 %" / "11178 MiB": keep the number.
                'utilization.gpu': int(items[2].split(' ')[0]),
                'memory.total': int(items[3].split(' ')[0]),
                'memory.used': int(items[4].split(' ')[0]),
                'processes': [],
            }
        except (IndexError, ValueError):
            # Blank or malformed line (too few columns, non-numeric value).
            continue
        gpu_info_list.append(gpu_info)
        gpu_info_dict[gpu_info['uuid']] = gpu_info

    pid_set = set()
    if gpu_info_list:
        query_apps_cmd = 'nvidia-smi --query-compute-apps=gpu_uuid,pid,process_name,used_memory --format=csv'
        app_info_raw = ssh_execute(host, user, query_apps_cmd, port, private_key_path).decode('utf-8')
        # The first line is the CSV header.
        for app_info_line in app_info_raw.split('\n')[1:]:
            items = [item.strip() for item in app_info_line.split(',')]
            try:
                uuid = items[0]
                app_info = {
                    'pid': int(items[1]),
                    'command': items[2],
                    'gpu_memory_usage': int(items[3].split(' ')[0]),
                }
                # Processes that hold no GPU memory are ignored.
                if app_info['gpu_memory_usage'] != 0:
                    gpu_info_dict[uuid]['processes'].append(app_info)
                    pid_set.add(app_info['pid'])
            except (IndexError, ValueError, KeyError):
                # Malformed line, or a process on a GPU that was not parsed.
                continue

    pid_username_dict = {}
    if pid_set:
        query_pid_cmd = 'ps -o ruser=userForLongName -o pid -p ' + ' '.join(map(str, pid_set)) + ' | awk \'{print $1, $2}\' | grep -v \'PID\''
        pid_raw = ssh_execute(host, user, query_pid_cmd, port, private_key_path).decode('utf-8')
        for pid_line in pid_raw.split('\n'):
            try:
                username, pid = pid_line.split(' ')
                pid_username_dict[int(pid.strip())] = username.strip()
            except ValueError:
                # Blank line or unexpected column count.
                continue

    for gpu_info in gpu_info_list:
        for process in gpu_info['processes']:
            process['username'] = pid_username_dict.get(process['pid'], '')
    return gpu_info_list


class GPUInfoUpdater:
    """Refresh the stored GPU state of every registered server.

    The reported utilization is the maximum over the most recent samples
    per GPU, so the history only helps when the same instance is reused
    across refreshes.
    """

    def __init__(self, user, private_key_path=None, history_size=10):
        """
        :param user: ssh login name used on every server.
        :param private_key_path: ssh identity file, or ``None``.
        :param history_size: number of recent utilization samples kept per
            GPU; must be at least 1.
        :raises ValueError: if ``history_size`` is smaller than 1.
        """
        if history_size < 1:
            raise ValueError('history_size must be at least 1')
        self.user = user
        self.private_key_path = private_key_path
        self.history_size = history_size
        self.utilization_history = {}

    def update_utilization(self, uuid, utilization):
        """Record a sample for ``uuid`` and return the recent maximum.

        :param uuid: GPU identifier used as the history key.
        :param utilization: newest utilization sample.
        :return: largest value among the last ``history_size`` samples.
        """
        history = self.utilization_history.setdefault(uuid, [])
        history.append(utilization)
        # Drop everything except the newest ``history_size`` samples.
        del history[:-self.history_size]
        return max(history)

    def update_gpu_info(self):
        """Poll every server and store the current state of its GPUs.

        A server whose query fails is logged and marked as not valid; a
        server that answers again is marked valid.
        """
        for server in GPUServer.objects.all():
            try:
                if not server.hostname:
                    add_hostname(server, self.user, self.private_key_path)
                gpu_info_json = get_gpu_status(server.ip, self.user, server.port, self.private_key_path)
                if not server.valid:
                    server.valid = True
                    # Write only this column: the instance was loaded before
                    # the (slow) ssh calls and must not overwrite edits made
                    # to other columns in the meantime.
                    server.save(update_fields=['valid'])
                for gpu in gpu_info_json:
                    # One upsert keyed on the uuid. index, name and server
                    # are refreshed too, so an existing row cannot keep
                    # stale values for them.
                    GPUInfo.objects.update_or_create(
                        uuid=gpu['uuid'],
                        defaults={
                            'name': gpu['name'],
                            'index': gpu['index'],
                            'utilization': self.update_utilization(gpu['uuid'], gpu['utilization.gpu']),
                            'memory_total': gpu['memory.total'],
                            'memory_used': gpu['memory.used'],
                            'processes': '\n'.join(json.dumps(process) for process in gpu['processes']),
                            'complete_free': len(gpu['processes']) == 0,
                            'server': server,
                        },
                    )
            except (subprocess.CalledProcessError, subprocess.TimeoutExpired, RuntimeError):
                task_logger.error('Update ' + server.ip + ' failed')
                server.valid = False
                server.save(update_fields=['valid'])
================================================ email_settings.py ================================================ FILE: gpu_tasker/__init__.py ================================================ ================================================ FILE: gpu_tasker/asgi.py ================================================ """ ASGI config for gpu_tasker project. It exposes the ASGI callable as a module-level variable named ``application``. For more information on this file, see https://docs.djangoproject.com/en/3.1/howto/deployment/asgi/ """ import os from django.core.asgi import get_asgi_application os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'gpu_tasker.settings') application = get_asgi_application() ================================================ FILE: gpu_tasker/email_settings_sample.py ================================================ EMAIL_BACKEND = 'django.core.mail.backends.smtp.EmailBackend' EMAIL_HOST = 'smtp.163.com' EMAIL_PORT = 465 EMAIL_HOST_USER = 'xxx@163.com' EMAIL_HOST_PASSWORD = 'xxx' EMAIL_USE_SSL = True EMAIL_USE_LOCALTIME = True DEFAULT_FROM_EMAIL = 'GPUTasker<{}>'.format(EMAIL_HOST_USER) SERVER_EMAIL = EMAIL_HOST_USER ================================================ FILE: gpu_tasker/settings.py ================================================ """ Django settings for gpu_tasker project. Generated by 'django-admin startproject' using Django 3.1.3. For more information on this file, see https://docs.djangoproject.com/en/3.1/topics/settings/ For the full list of settings and their values, see https://docs.djangoproject.com/en/3.1/ref/settings/ """ import os from pathlib import Path # Build paths inside the project like this: BASE_DIR / 'subdir'. BASE_DIR = Path(__file__).resolve().parent.parent # Quick-start development settings - unsuitable for production # See https://docs.djangoproject.com/en/3.1/howto/deployment/checklist/ # SECURITY WARNING: keep the secret key used in production secret! 
SECRET_KEY = '&e$9q=9ak)%8l)^w_n89ip4m5d27^@32$vy#&k53q%g^$)y)rh' # SECURITY WARNING: don't run with debug turned on in production! DEBUG = False ALLOWED_HOSTS = ['*'] # Application definition INSTALLED_APPS = [ 'simpleui', 'django.contrib.admin', 'django.contrib.auth', 'django.contrib.contenttypes', 'django.contrib.sessions', 'django.contrib.messages', 'django.contrib.staticfiles', 'base', 'gpu_info', 'task' ] MIDDLEWARE = [ 'django.middleware.security.SecurityMiddleware', 'django.contrib.sessions.middleware.SessionMiddleware', 'django.middleware.common.CommonMiddleware', 'django.middleware.csrf.CsrfViewMiddleware', 'django.contrib.auth.middleware.AuthenticationMiddleware', 'django.contrib.messages.middleware.MessageMiddleware', 'django.middleware.clickjacking.XFrameOptionsMiddleware', ] ROOT_URLCONF = 'gpu_tasker.urls' TEMPLATES = [ { 'BACKEND': 'django.template.backends.django.DjangoTemplates', 'DIRS': [], 'APP_DIRS': True, 'OPTIONS': { 'context_processors': [ 'django.template.context_processors.debug', 'django.template.context_processors.request', 'django.contrib.auth.context_processors.auth', 'django.contrib.messages.context_processors.messages', ], }, }, ] WSGI_APPLICATION = 'gpu_tasker.wsgi.application' # Database # https://docs.djangoproject.com/en/3.1/ref/settings/#databases if os.getenv('DOCKER_DEPLOY') is None: DATABASES = { 'default': { 'ENGINE': 'django.db.backends.sqlite3', 'NAME': BASE_DIR / 'db.sqlite3', } } else: DATABASES = { 'default': { 'ENGINE': 'django.db.backends.mysql', 'NAME': 'gpu_tasker', 'USER': 'gpu_tasker', 'PASSWORD': 'gpu_tasker', 'HOST': 'mariadb', 'PORT': '3306', } } # Password validation # https://docs.djangoproject.com/en/3.1/ref/settings/#auth-password-validators AUTH_PASSWORD_VALIDATORS = [ { 'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator', }, { 'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator', }, { 'NAME': 
'django.contrib.auth.password_validation.CommonPasswordValidator', }, { 'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator', }, ] # Internationalization # https://docs.djangoproject.com/en/3.1/topics/i18n/ LANGUAGE_CODE = 'zh-Hans' TIME_ZONE = 'Asia/Shanghai' USE_I18N = True USE_L10N = True USE_TZ = False # Static files (CSS, JavaScript, Images) # https://docs.djangoproject.com/en/3.1/howto/static-files/ STATIC_URL = '/static/' STATICFILES_DIRS = [ os.path.join(BASE_DIR, 'static') ] STATIC_ROOT = 'static_collected' SIMPLEUI_ANALYSIS = False SIMPLEUI_HOME_INFO = False RUNNING_LOG_DIR = 'running_log' PRIVATE_KEY_DIR = 'private_key' if not os.path.isdir(RUNNING_LOG_DIR): os.makedirs(RUNNING_LOG_DIR) if not os.path.isdir(PRIVATE_KEY_DIR): os.makedirs(PRIVATE_KEY_DIR) SERVER_LOG_DIR = os.path.join(BASE_DIR, 'server_log') if not os.path.isdir(SERVER_LOG_DIR): os.makedirs(SERVER_LOG_DIR) LOGGING = { 'version': 1, 'disable_existing_loggers': True, 'formatters': { 'standard': { 'format': '%(asctime)s [%(threadName)s:%(thread)d] [%(name)s:%(lineno)d] [%(module)s:%(funcName)s] [%(levelname)s]- %(message)s'}, }, 'filters': { 'require_debug_true': { '()': 'django.utils.log.RequireDebugTrue', }, 'require_debug_false': { '()': 'django.utils.log.RequireDebugFalse', } }, 'handlers': { 'null': { 'level': 'DEBUG', 'class': 'logging.NullHandler', }, 'console': { 'level': 'DEBUG', 'class': 'logging.StreamHandler', 'formatter': 'standard' }, 'req_err': { 'level': 'ERROR', 'class': 'logging.handlers.RotatingFileHandler', 'filename': os.path.join(SERVER_LOG_DIR, 'request_error.log'), 'maxBytes': 1024 * 1024 * 5, 'backupCount': 5, 'formatter': 'standard', }, 'req_info': { 'level': 'INFO', 'class': 'logging.handlers.RotatingFileHandler', 'filename': os.path.join(SERVER_LOG_DIR, 'request_info.log'), 'maxBytes': 1024 * 1024 * 5, 'backupCount': 5, 'formatter': 'standard', }, 'info_log': { 'level': 'INFO', 'class': 'logging.handlers.RotatingFileHandler', 'filename': 
os.path.join(SERVER_LOG_DIR, 'task_info.log'), 'maxBytes': 1024 * 1024 * 5, 'backupCount': 5, 'formatter': 'standard', }, }, 'loggers': { # logging管理器 'django': { 'handlers': ['req_info'], 'level': 'INFO', 'propagate': False }, 'django.request': { 'handlers': ['req_err'], 'level': 'ERROR', 'propagate': False, }, 'django.task': { 'handlers': ['info_log', 'console'], 'level': 'INFO', 'propagate': False, }, 'django.security.DisallowedHost': { 'handlers': ['null'], 'propagate': False, }, } } try: from gpu_tasker.email_settings import * EMAIL_NOTIFICATION = True except ModuleNotFoundError: EMAIL_NOTIFICATION = False ================================================ FILE: gpu_tasker/urls.py ================================================ """gpu_tasker URL Configuration The `urlpatterns` list routes URLs to views. For more information please see: https://docs.djangoproject.com/en/3.1/topics/http/urls/ Examples: Function views 1. Add an import: from my_app import views 2. Add a URL to urlpatterns: path('', views.home, name='home') Class-based views 1. Add an import: from other_app.views import Home 2. Add a URL to urlpatterns: path('', Home.as_view(), name='home') Including another URLconf 1. Import the include() function: from django.urls import include, path 2. Add a URL to urlpatterns: path('blog/', include('blog.urls')) """ from django.contrib import admin from django.urls import path from django.shortcuts import redirect admin.site.site_header = 'GPU任务管理平台' admin.site.site_title = 'GPU任务管理平台' def index_view(request): return redirect('/admin') urlpatterns = [ path('admin/', admin.site.urls), path('', index_view) ] ================================================ FILE: gpu_tasker/wsgi.py ================================================ """ WSGI config for gpu_tasker project. It exposes the WSGI callable as a module-level variable named ``application``. 
For more information on this file, see https://docs.djangoproject.com/en/3.1/howto/deployment/wsgi/ """ import os from django.core.wsgi import get_wsgi_application os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'gpu_tasker.settings') application = get_wsgi_application() ================================================ FILE: main.py ================================================ import os import time import threading import logging import django os.environ.setdefault("DJANGO_SETTINGS_MODULE", "gpu_tasker.settings") django.setup() from base.utils import get_admin_config from task.models import GPUTask from task.utils import run_task from gpu_info.utils import GPUInfoUpdater task_logger = logging.getLogger('django.task') if __name__ == '__main__': while True: start_time = time.time() try: server_username, server_private_key_path = get_admin_config() gpu_updater = GPUInfoUpdater(server_username, server_private_key_path) task_logger.info('Running processes: {:d}'.format( threading.active_count() - 1 )) gpu_updater.update_gpu_info() for task in GPUTask.objects.filter(status=0): available_server = task.find_available_server() if available_server is not None: t = threading.Thread(target=run_task, args=(task, available_server)) t.start() time.sleep(5) except Exception as e: task_logger.error(str(e)) finally: end_time = time.time() # 确保至少间隔十秒,减少服务器负担 duration = end_time - start_time if duration < 10: time.sleep(10 - duration) ================================================ FILE: manage.py ================================================ #!/usr/bin/env python """Django's command-line utility for administrative tasks.""" import os import sys def main(): """Run administrative tasks.""" os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'gpu_tasker.settings') try: from django.core.management import execute_from_command_line except ImportError as exc: raise ImportError( "Couldn't import Django. Are you sure it's installed and " "available on your PYTHONPATH environment variable? 
Did you " "forget to activate a virtual environment?" ) from exc execute_from_command_line(sys.argv) if __name__ == '__main__': main() ================================================ FILE: nginx/conf.d/gpu_tasker.conf ================================================ server { listen 80; access_log /var/log/nginx/access.log main; charset utf-8; gzip_types text/plain application/x-javascript text/css text/javascript application/x-httpd-php application/json text/json image/jpeg image/gif image/png application/octet-stream; error_page 404 /404.html; error_page 500 502 503 504 /50x.html; location / { include uwsgi_params; uwsgi_connect_timeout 30; uwsgi_pass gputasker:9009; } location /static { alias /static_collected; } } ================================================ FILE: notification/__init__.py ================================================ ================================================ FILE: notification/email_notification.py ================================================ import traceback from django.core.mail import send_mail from gpu_tasker.settings import EMAIL_NOTIFICATION TASK_START_NOTIFICATION_TITLE = '任务开始运行' TASK_START_NOTIFICATION_TEMPLATE = \ '''任务[{}]开始运行 任务运行详情: 任务名称:{} 工作目录:{} 命令: ---------- {} ---------- 服务器:{} 显卡:{} 开始时间:{} ''' TASK_FINISH_NOTIFICATION_TITLE = '任务运行完成' TASK_FINISH_NOTIFICATION_TEMPLATE = \ '''任务[{}]运行完成 任务运行详情: 任务名称:{} 工作目录:{} 命令: ---------- {} ---------- 服务器:{} 显卡:{} 结束时间:{} 请登录GPUTasker查看运行结果 ''' TASK_FAIL_NOTIFICATION_TITLE = '任务运行失败' TASK_FAIL_NOTIFICATION_TEMPLATE = \ '''任务[{}]运行失败 任务运行详情: 任务名称:{} 工作目录:{} 命令: ---------- {} ---------- 服务器:{} 显卡:{} 结束时间:{} 请登录GPUTasker查看错误信息 ''' def send_email(address, title, content): if EMAIL_NOTIFICATION: try: from gpu_tasker.settings import DEFAULT_FROM_EMAIL send_mail(title, content, DEFAULT_FROM_EMAIL, [address], fail_silently=False) except Exception: es = traceback.format_exc() print('Send email fail') print(es) def check_email_config(func): def wrapper(*args, **kw): if 
EMAIL_NOTIFICATION: running_log = args[0] address = running_log.task.user.email if address is not None and address != '': return func(*args, **kw) return wrapper @check_email_config def send_task_start_email(running_log): address = running_log.task.user.email title = TASK_START_NOTIFICATION_TITLE content = TASK_START_NOTIFICATION_TEMPLATE.format( running_log.task.name, running_log.task.name, running_log.task.workspace, running_log.task.cmd, running_log.server.ip, running_log.gpus, running_log.start_at.strftime("%Y-%m-%d %H:%M:%S") ) send_email(address, title, content) @check_email_config def send_task_finish_email(running_log): address = running_log.task.user.email title = TASK_FINISH_NOTIFICATION_TITLE content = TASK_FINISH_NOTIFICATION_TEMPLATE.format( running_log.task.name, running_log.task.name, running_log.task.workspace, running_log.task.cmd, running_log.server.ip, running_log.gpus, running_log.update_at.strftime("%Y-%m-%d %H:%M:%S") ) send_email(address, title, content) @check_email_config def send_task_fail_email(running_log): address = running_log.task.user.email title = TASK_FAIL_NOTIFICATION_TITLE content = TASK_FAIL_NOTIFICATION_TEMPLATE.format( running_log.task.name, running_log.task.name, running_log.task.workspace, running_log.task.cmd, running_log.server.ip, running_log.gpus, running_log.update_at.strftime("%Y-%m-%d %H:%M:%S") ) send_email(address, title, content) ================================================ FILE: static/css/admin/custom.css ================================================ .inline-group .tabular td.original p { position: relative; margin-top: 1rem!important; width: 100%; font-size: 0; overflow: visible; } .inline-group .tabular td.original p a{ font-size: 10px; padding-left: 10px; } ================================================ FILE: task/__init__.py ================================================ default_app_config = 'task.apps.TaskConfig' ================================================ FILE: task/admin.py 
================================================ from django.contrib import admin from django.utils.html import format_html from .models import GPUTask, GPUTaskRunningLog class GPUTaskRunningLogInline(admin.TabularInline): model = GPUTaskRunningLog fields = ('index', 'server', 'gpus', 'log_file_path', 'color_status', 'start_at', 'update_at',) readonly_fields = ('index', 'server', 'gpus', 'log_file_path', 'color_status', 'start_at', 'update_at',) show_change_link = True verbose_name = '运行记录' verbose_name_plural = '运行记录' def get_extra(self, request, obj, **kwargs): return 0 def has_add_permission(self, request, obj): return False def has_change_permission(self, request, obj): return False def color_status(self, obj): if obj.status == -1: status = '运行失败' color_code = 'red' elif obj.status == 1: status = '运行中' color_code = '#ecc849' elif obj.status == 2: status = '已完成' color_code = 'green' else: status = '未知状态' color_code = 'red' return format_html('{}', color_code, status) color_status.short_description = '状态' color_status.admin_order_field = 'status' @admin.register(GPUTask) class GPUTaskAdmin(admin.ModelAdmin): list_display = ('id', 'name', 'workspace', 'cmd', 'gpu_requirement', 'exclusive_gpu', 'memory_requirement', 'utilization_requirement', 'assign_server', 'priority', 'color_status', 'create_at', 'update_at',) list_filter = ('gpu_requirement', 'status', 'assign_server', 'priority') search_fields = ('name', 'status',) list_display_links = ('name',) readonly_fields = ('create_at', 'update_at', 'user',) inlines = (GPUTaskRunningLogInline,) actions = ('copy_task', 'restart_task',) class Media: # custom css css = { 'all': ('css/admin/custom.css', ) } def get_queryset(self, request): qs = super().get_queryset(request) if request.user.is_superuser: return qs return qs.filter(user=request.user) def has_add_permission(self, request): return True def save_model(self, request, obj, form, change): if not change: obj.user = request.user # format cmd obj.cmd = 
obj.cmd.replace('\r\n', '\n') if obj.cmd[-1] != '\n': obj.cmd = obj.cmd + '\n' super().save_model(request, obj, form, change) def color_status(self, obj): if obj.status == -2: status = '未就绪' color_code = 'gray' elif obj.status == -1: status = '运行失败' color_code = 'red' elif obj.status == 0: status = '准备就绪' color_code = 'blue' elif obj.status == 1: status = '运行中' color_code = '#ecc849' elif obj.status == 2: status = '已完成' color_code = 'green' else: status = '未知状态' color_code = 'red' return format_html('{}', color_code, status) color_status.short_description = '状态' color_status.admin_order_field = 'status' def delete_queryset(self, request, queryset): for task in queryset: for running_task in task.task_logs.all(): running_task.delete_log_file() task.delete() def copy_task(self, request, queryset): for task in queryset: new_task = GPUTask( name=task.name + '_copy', user=task.user, workspace=task.workspace, cmd=task.cmd, exclusive_gpu=task.exclusive_gpu, gpu_requirement=task.gpu_requirement, memory_requirement=task.memory_requirement, utilization_requirement=task.utilization_requirement, assign_server=task.assign_server, priority=task.priority, status=-2 ) new_task.save() copy_task.short_description = '复制任务' copy_task.icon = 'el-icon-document-copy' copy_task.type = 'success' def restart_task(self, request, queryset): for task in queryset: task.status = 0 task.save() restart_task.short_description = '重新开始' restart_task.icon = 'el-icon-refresh-left' restart_task.type = 'success' @admin.register(GPUTaskRunningLog) class GPUTaskRunningLogAdmin(admin.ModelAdmin): list_display = ('id', 'index', 'task', 'server', 'gpus', 'log_file_path', 'color_status', 'start_at', 'update_at',) list_filter = ('task', 'server', 'status') search_fields = ('task', 'server',) list_display_links = ('task',) readonly_fields = ('start_at', 'update_at', 'log', 'task', 'index', 'server', 'gpus', 'status', 'log_file_path', 'pid') fieldsets = ( ('基本信息', {'fields': ['task', 'index', 'server', 'gpus', 
'pid']}), ('状态信息', {'fields': ['status', 'start_at', 'update_at']}), ('日志', {'fields': ['log_file_path', 'log']}) ) actions = ('kill_button',) def get_queryset(self, request): qs = super().get_queryset(request) if request.user.is_superuser: return qs return qs.filter(task__user=request.user) def has_add_permission(self, request): return False def delete_queryset(self, request, queryset): for running_task in queryset: running_task.delete_log_file() running_task.delete() def color_status(self, obj): if obj.status == -1: status = '运行失败' color_code = 'red' elif obj.status == 1: status = '运行中' color_code = '#ecc849' elif obj.status == 2: status = '已完成' color_code = 'green' else: status = '未知状态' color_code = 'red' return format_html('{}', color_code, status) color_status.short_description = '状态' color_status.admin_order_field = 'status' def log(self, obj): try: with open(obj.log_file_path, 'r') as f: return f.read() except Exception: return 'Error: Cannot open log file' log.short_description = '日志' def kill_button(self, request, queryset): for running_task in queryset: if running_task.status == 1: running_task.kill() kill_button.short_description = '结束进程' kill_button.icon = 'el-icon-error' kill_button.type = 'danger' kill_button.confirm = '是否执意结束选中进程?' 
================================================ FILE: task/apps.py ================================================ from django.apps import AppConfig class TaskConfig(AppConfig): name = 'task' verbose_name = 'GPU任务管理' ================================================ FILE: task/migrations/__init__.py ================================================ ================================================ FILE: task/models.py ================================================ import os import signal from django.db import models from django.core.validators import MaxValueValidator, MinValueValidator from gpu_info.models import GPUServer, GPUInfo from django.contrib.auth.models import User class GPUTask(models.Model): STATUS_CHOICE = ( (-2, '未就绪'), (-1, '运行失败'), (0, '准备就绪'), (1, '运行中'), (2, '已完成'), ) name = models.CharField('任务名称', max_length=100) user = models.ForeignKey(User, verbose_name='用户', on_delete=models.CASCADE, related_name='tasks') workspace = models.CharField('工作目录', max_length=200) cmd = models.TextField('命令') gpu_requirement = models.PositiveSmallIntegerField( 'GPU数量需求', default=1, validators=[MaxValueValidator(8), MinValueValidator(0)] ) exclusive_gpu = models.BooleanField('独占显卡', default=False) memory_requirement = models.PositiveSmallIntegerField('显存需求(MB)', default=0) utilization_requirement = models.PositiveSmallIntegerField('利用率需求(%)', default=0) assign_server = models.ForeignKey(GPUServer, verbose_name='指定服务器', on_delete=models.SET_NULL, blank=True, null=True) priority = models.SmallIntegerField('优先级', default=0) status = models.SmallIntegerField('状态', choices=STATUS_CHOICE, default=0) create_at = models.DateTimeField('创建时间', auto_now_add=True) update_at = models.DateTimeField('更新时间', auto_now=True) class Meta: verbose_name = 'GPU任务' verbose_name_plural = 'GPU任务' def __str__(self): return self.name def find_available_server(self): # TODO(Yuhao Wang): 优化算法,找最优server available_server = None if self.assign_server is None: for server in 
GPUServer.objects.all(): available_gpus = server.get_available_gpus( self.gpu_requirement, self.exclusive_gpu, self.memory_requirement, self.utilization_requirement ) if available_gpus is not None: available_server = { 'server': server, 'gpus': available_gpus[:self.gpu_requirement] } break else: available_gpus = self.assign_server.get_available_gpus( self.gpu_requirement, self.exclusive_gpu, self.memory_requirement, self.utilization_requirement ) if available_gpus is not None: available_server = { 'server': self.assign_server, 'gpus': available_gpus[:self.gpu_requirement] } return available_server class GPUTaskRunningLog(models.Model): STATUS_CHOICE = ( (-1, '运行失败'), (1, '运行中'), (2, '已完成'), ) index = models.PositiveSmallIntegerField('序号') task = models.ForeignKey(GPUTask, verbose_name='任务', on_delete=models.CASCADE, related_name='task_logs') server = models.ForeignKey(GPUServer, verbose_name='服务器', on_delete=models.SET_NULL, related_name='task_logs', null=True) pid = models.IntegerField('PID') gpus = models.CharField('GPU', max_length=20) log_file_path = models.FilePathField(path='running_log', match='.*\.log$', verbose_name="日志文件") status = models.SmallIntegerField('状态', choices=STATUS_CHOICE, default=1) start_at = models.DateTimeField('开始时间', auto_now_add=True) update_at = models.DateTimeField('更新时间', auto_now=True) class Meta: ordering = ('-id',) verbose_name = 'GPU任务运行记录' verbose_name_plural = 'GPU任务运行记录' def __str__(self): return self.task.name + '-' + str(self.index) def kill(self): os.kill(self.pid, signal.SIGKILL) def delete_log_file(self): if os.path.isfile(self.log_file_path): os.remove(self.log_file_path) ================================================ FILE: task/tests.py ================================================ from django.test import TestCase # Create your tests here. 
================================================ FILE: task/utils.py ================================================ import os import signal import subprocess import json import time import traceback import logging from gpu_tasker.settings import RUNNING_LOG_DIR from .models import GPUTask, GPUTaskRunningLog from notification.email_notification import \ send_task_start_email, send_task_finish_email, send_task_fail_email task_logger = logging.getLogger('django.task') def generate_ssh_cmd(host, user, exec_cmd, port=22, private_key_path=None): exec_cmd = exec_cmd.replace('$', '\\$') if private_key_path is None: cmd = "ssh -o StrictHostKeyChecking=no -p {:d} {}@{} \"{}\"".format(port, user, host, exec_cmd) else: cmd = "ssh -o StrictHostKeyChecking=no -p {:d} -i {} {}@{} \"{}\"".format(port, private_key_path, user, host, exec_cmd) return cmd class RemoteProcess: def __init__(self, user, host, cmd, workspace="~", port=22, private_key_path=None, output_file=None): self.cmd = generate_ssh_cmd(host, user, "cd {} && {}".format(workspace, cmd), port, private_key_path) task_logger.info('cmd:\n' + self.cmd) if output_file is not None: self.output_file = output_file with open(self.output_file, "wb") as out: self.proc = subprocess.Popen(self.cmd, shell=True, stdout=out, stderr=out, bufsize=1) else: self.proc = subprocess.Popen(self.cmd, shell=True) def pid(self): return self.proc.pid def kill(self): # os.killpg(os.getpgid(self.proc.pid), signal.SIGKILL) os.kill(self.proc.pid, signal.SIGKILL) def get_return_code(self): self.proc.wait() return self.proc.returncode class RemoteGPUProcess(RemoteProcess): def __init__(self, user, host, gpus, cmd, workspace="~", port=22, private_key_path=None, output_file=None): env = 'export CUDA_VISIBLE_DEVICES={}'.format(','.join(map(str, gpus))) cmd = 'bash -c \'{}\n{}\n\''.format(env, cmd) super(RemoteGPUProcess, self).__init__(user, host, cmd, workspace, port, private_key_path, output_file) def run_task(task, available_server): server = 
available_server['server'] gpus = available_server['gpus'] index = task.task_logs.all().count() log_file_path = os.path.join( RUNNING_LOG_DIR, '{:d}_{:s}_{:s}_{:d}_{:d}.log'.format(task.id, task.name, server.ip, index, int(time.time())) ) # create running_log running_log = GPUTaskRunningLog( index=index, task=task, server=server, pid=-1, gpus=','.join(map(str, gpus)), log_file_path=log_file_path, status=1 ) running_log.save() try: # run process process = RemoteGPUProcess( task.user.config.server_username, server.ip, gpus, task.cmd, task.workspace, server.port, task.user.config.server_private_key_path, log_file_path ) pid = process.pid() task_logger.info('Task {:d}-{:s} is running, pid: {:d}'.format(task.id, task.name, pid)) # save process status running_log.pid = pid running_log.save() server.set_gpus_busy(gpus) server.save() task.status = 1 task.save() # send email send_task_start_email(running_log) # wait for return return_code = process.get_return_code() task_logger.info('Task {:d}-{:s} stopped, return_code: {:d}'.format(task.id, task.name, return_code)) # save process status running_log.status = 2 if return_code == 0 else -1 running_log.save() task.status = 2 if return_code == 0 else -1 task.save() # send email if return_code == 0: send_task_finish_email(running_log) else: send_task_fail_email(running_log) except Exception: es = traceback.format_exc() task_logger.error(es) running_log.status = -1 running_log.save() task.status = -1 task.save() with open(log_file_path, 'a') as f: f.write('\n') f.write(es) finally: server.set_gpus_free(gpus) server.save() ================================================ FILE: task/views.py ================================================ from django.shortcuts import render # Create your views here. 
================================================
FILE: uwsgi/uwsgi.ini
================================================
[uwsgi]
chdir=/gpu_tasker
module=gpu_tasker.wsgi:application
socket=:9009
workers=5
pidfile=/gpu_tasker/uwsgi/uwsgi.pid
uid=root
gid=root
master=true
vacuum=true
thunder-lock=true
enable-threads=true
harakiri=30
post-buffering=4096
daemonize=/gpu_tasker/uwsgi/log/uwsgi.log
py-autoreload=1