Repository: aovoc/nnieqat-pytorch Branch: master Commit: 91410cf331a1 Files: 36 Total size: 75.0 KB Directory structure: gitextract_yis4nxki/ ├── LICENSE.txt ├── MANIFEST.in ├── Makefile ├── README.md ├── build_helper.py ├── docker/ │ └── Dockerfile ├── docs/ │ ├── Makefile │ ├── make.bat │ └── source/ │ ├── build_helper.rst │ ├── conf.py │ ├── index.rst │ ├── modules.rst │ ├── nnieqat.cuda10.rst │ ├── nnieqat.modules.rst │ ├── nnieqat.rst │ └── setup.rst ├── nnieqat/ │ ├── __init__.py │ ├── cuda10/ │ │ ├── LICENSE.txt │ │ └── lib/ │ │ ├── gfpq.lib │ │ ├── libgfpq.a │ │ ├── libgfpq.so.1.1.5 │ │ ├── libgfpq_gpu.a │ │ └── libgfpq_gpu.so.1.1.5 │ └── quantize.py ├── pyproject.toml ├── setup.cfg ├── setup.py ├── src/ │ ├── fake_quantize.cpp │ ├── fake_quantize.cu │ ├── fake_quantize.h │ └── test/ │ ├── Makefile │ └── test.cu └── tests/ ├── test_cifar10.py ├── test_imagenet.py ├── test_merge_freeze_bn.py └── test_quant_impl.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: LICENSE.txt ================================================ MIT License Copyright (c) Minqin Chen Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: MANIFEST.in ================================================ ================================================ FILE: Makefile ================================================ # Uncomment for debugging # DEBUG := 1 # Pretty build # Q ?= @ CXX := g++ python := python3 PYTHON_HEADER_DIR := $(shell python -c 'from distutils.sysconfig import get_python_inc; print(get_python_inc())') PYTORCH_INCLUDES := $(shell python -c 'from torch.utils.cpp_extension import include_paths; [print(p) for p in include_paths()]') PYTORCH_LIBRARIES := $(shell python -c 'from torch.utils.cpp_extension import library_paths; [print(p) for p in library_paths()]') CUDA_DIR := $(shell python -c 'from torch.utils.cpp_extension import _find_cuda_home; print(_find_cuda_home())') WITH_ABI := $(shell python -c 'import torch; print(int(torch._C._GLIBCXX_USE_CXX11_ABI))') INCLUDE_DIRS := ./ $(CUDA_DIR)/include INCLUDE_DIRS += $(PYTHON_HEADER_DIR) INCLUDE_DIRS += $(PYTORCH_INCLUDES) # Custom (MKL/ATLAS/OpenBLAS) include and lib directories. 
# BLAS_INCLUDE := /path/to/your/blas # BLAS_LIB := /path/to/your/blas SRC_DIR := ./src OBJ_DIR := ./obj CPP_SRCS := $(wildcard $(SRC_DIR)/*.cpp) CU_SRCS := $(wildcard $(SRC_DIR)/*.cu) OBJS := $(patsubst $(SRC_DIR)/%.cpp,$(OBJ_DIR)/%.o,$(CPP_SRCS)) CU_OBJS := $(patsubst $(SRC_DIR)/%.cu,$(OBJ_DIR)/cuda/%.o,$(CU_SRCS)) STATIC_LIB := $(OBJ_DIR)/libquant_impl.a CUDA_ARCH := -gencode arch=compute_50,code=sm_50 \ -gencode arch=compute_52,code=sm_52 \ -gencode arch=compute_60,code=sm_60 \ -gencode arch=compute_61,code=sm_61 \ -gencode arch=compute_70,code=sm_70 \ -gencode arch=compute_75,code=sm_75 \ -gencode arch=compute_75,code=compute_75 LIBRARIES += stdc++ cudart c10 caffe2 torch torch_python caffe2_gpu ifeq ($(DEBUG), 1) COMMON_FLAGS += -DDEBUG -g -O0 NVCCFLAGS += -g -G # -rdc true else COMMON_FLAGS += -DNDEBUG -O3 endif WARNINGS := -Wall -Wno-sign-compare -Wcomment INCLUDE_DIRS += $(BLAS_INCLUDE) CXXFLAGS += -MMD -MP COMMON_FLAGS += $(foreach includedir,$(INCLUDE_DIRS),-I$(includedir)) \ -DTORCH_API_INCLUDE_EXTENSION_H -D_GLIBCXX_USE_CXX11_ABI=$(WITH_ABI) CXXFLAGS += -pthread -fPIC -fwrapv -std=c++14 $(COMMON_FLAGS) $(WARNINGS) NVCCFLAGS += -std=c++14 -ccbin=$(CXX) -Xcompiler -fPIC -use_fast_math $(COMMON_FLAGS) default: $(STATIC_LIB) $(OBJ_DIR): @ mkdir -p $@ @ mkdir -p $@/cuda $(OBJ_DIR)/%.o: $(SRC_DIR)/%.cpp | $(OBJ_DIR) @ echo CXX $< $(Q)$(CXX) $< $(CXXFLAGS) -c -o $@ $(OBJ_DIR)/cuda/%.o: $(SRC_DIR)/%.cu | $(OBJ_DIR) @ echo NVCC $< $(Q)nvcc $(NVCCFLAGS) $(CUDA_ARCH) -M $< -o ${@:.o=.d} \ -odir $(@D) $(Q)nvcc $(NVCCFLAGS) $(CUDA_ARCH) -c $< -o $@ $(STATIC_LIB): $(OBJS) $(CU_OBJS) | $(OBJ_DIR) $(RM) -f $(STATIC_LIB) $(RM) -rf build dist @ echo LD -o $@ ar rc $(STATIC_LIB) $(OBJS) $(CU_OBJS) build: $(python) setup.py build upload: $(python) setup.py sdist bdist_wheel #twine upload dist/* clean: $(RM) -rf build dist nnieqat.egg-info test: nosetests -s tests/test_quant_impl.py --nologcapture nosetests -s tests/test_merge_freeze_bn.py --nologcapture lint: pylint nnieqat --reports=n lintfull: pylint nnieqat install: $(python) setup.py install uninstall: $(python) setup.py install --record install.log cat install.log | xargs rm -rf $(RM) install.log
================================================ FILE: README.md ================================================
# nnieqat-pytorch

Nnieqat is a quantization-aware training (QAT) package for HiSilicon's Neural Network Inference Engine (NNIE) on PyTorch. It uses the HiSilicon quantization library to quantize a module's weights and activations, keeping them as fake-quantized FP32 values during training.

## Table of Contents

- [nnieqat-pytorch](#nnieqat-pytorch)
  - [Table of Contents](#table-of-contents)
  - [Installation](#installation)
  - [Usage](#usage)
  - [Code Examples](#code-examples)
  - [Results](#results)
  - [Todo](#todo)
  - [Reference](#reference)
## Installation

* Supported Platforms: Linux
* Accelerators and GPUs: NVIDIA GPUs via CUDA ***10.1*** or ***10.2***.
* Dependencies:
  * python >= 3.5, < 4
  * llvmlite >= 0.31.0
  * pytorch >= 1.5
  * numba >= 0.42.0
  * numpy >= 1.18.1
* Install nnieqat from PyPI:

  ```shell
  $ pip install nnieqat
  ```

* Install nnieqat with Docker (an easy way to avoid environment problems):

  ```shell
  $ cd docker
  $ docker build -t nnieqat-image .
  ```

* Install nnieqat from the repository:

  ```shell
  $ git clone https://github.com/aovoc/nnieqat-pytorch
  $ cd nnieqat-pytorch
  $ make install
  ```
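* Verify the installation (optional). Importing `nnieqat` runs a small GFPQ self-test (`test()` in `nnieqat/__init__.py`), so on a CUDA-capable machine a bare import is enough to check that the bundled library loads:

  ```shell
  $ python -c "import nnieqat"
  ```

  Note that the self-test allocates CUDA tensors, so it will fail on a CPU-only machine.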
## Usage

* Add the quantization hooks. Weights and activations are quantized and dequantized with the HiSVP GFPQ library during the forward() pass.

  ```python
  from nnieqat import quant_dequant_weight, unquant_weight, merge_freeze_bn, register_quantization_hook
  ...
  register_quantization_hook(model)
  ...
  ```

* Merge BN weights into the preceding convolutions and freeze BN. When finetuning from a well-trained model (recommended), call merge_freeze_bn at the beginning; otherwise do it after a few epochs of training.

  ```python
  ...
  model.train()
  model = merge_freeze_bn(model)  # switches BN layers to eval() mode during training
  ...
  ```

* Unquantize weights before updating them:

  ```python
  ...
  model.apply(unquant_weight)  # use the original weights for the update
  optimizer.step()
  ...
  ```

* Dump a weight-quantized model:

  ```python
  ...
  model.apply(quant_dequant_weight)
  save_checkpoint(...)
  model.apply(unquant_weight)
  ...
  ```

* Use EMA with caution (not recommended).
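Putting these pieces together, here is a minimal sketch of a quantization-aware finetuning loop. It mirrors the training loops in `tests/test_cifar10.py` and `tests/test_imagenet.py`; `data_loader` is an assumed placeholder, and the checkpoint filename is made up for illustration:

```python
import torch
import torchvision.models as models
from nnieqat import (merge_freeze_bn, quant_dequant_weight,
                     register_quantization_hook, unquant_weight)

model = models.squeezenet1_1(pretrained=True)  # any well-trained torch.nn.Module
register_quantization_hook(model)              # fake-quantize weights/activations in forward()
model = model.cuda().train()
model = merge_freeze_bn(model)                 # fold BN into conv and freeze it

criterion = torch.nn.CrossEntropyLoss().cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, momentum=0.9)

for images, target in data_loader:             # data_loader: assumed placeholder
    loss = criterion(model(images.cuda()), target.cuda())
    optimizer.zero_grad()
    loss.backward()
    model.apply(unquant_weight)                # restore fp32 weights before the update
    optimizer.step()

# dump a checkpoint with quantized weights, then restore the originals to keep training
model.apply(quant_dequant_weight)
torch.save(model.state_dict(), "model_qat.pth")
model.apply(unquant_weight)
```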
## Code Examples

* [Cifar10 quantization-aware training example][cifar10_qat] (nnieqat added to the [pytorch_cifar10_tutorial][cifar10_example]):

  ```shell
  python tests/test_cifar10.py
  ```

* [ImageNet quantization finetuning example][imagenet_qat] (nnieqat added to [pytorch_imagenet_main.py][imagenet_example]):

  ```shell
  python tests/test_imagenet.py --pretrained path_to_imagenet_dataset
  ```
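For the curious: when the GFPQ library cannot be used, nnieqat falls back to its own CUDA kernel (`src/fake_quantize.cu`), which snaps each value onto a logarithmic 8-bit grid scaled by the tensor's maximum. The NumPy sketch below mirrors the kernel's positive-value 8-bit branch for illustration only; the real kernel also handles negative values symmetrically and flushes near-zero values to zero.

```python
import numpy as np

def fake_quantize_pos(a):
    """Log-domain 8-bit fake quantization of positive values, mirroring
    the 8-bit branch of fake_quantize_kernel_cuda in src/fake_quantize.cu."""
    step = np.log(256.0) / 128.0                      # ln(256)/128, the log-grid step
    data_max = np.max(np.abs(a))
    k = np.floor(np.log2(data_max) * 16) + 1          # snap the max to a power of 2^(1/16)
    data_max = 2.0 ** (k / 16.0)
    q = np.rint(np.log(256.0 * a / data_max) / step)  # nearest index on the log grid
    q = np.clip(q, 0, 127)                            # 127 positive levels
    return data_max / 256.0 * np.exp(q * step)        # dequantize back to float

x = np.array([1.0, 2.0, 3.0, 9.0])
print(fake_quantize_pos(x))  # ~ [1.000 2.000 2.954 8.724], cf. the expected values in test()
```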
## Results

* ImageNet

  ```shell
  python tests/test_imagenet.py /data/imgnet/ --arch squeezenet1_1 --lr 0.001 --pretrained --epochs 10    # nnie_lr_e-3_ft
  python pytorch_imagenet_main.py /data/imgnet/ --arch squeezenet1_1 --lr 0.0001 --pretrained --epochs 10  # lr_e-4_ft
  python tests/test_imagenet.py /data/imgnet/ --arch squeezenet1_1 --lr 0.0001 --pretrained --epochs 10   # nnie_lr_e-4_ft
  ```

  Finetuning results (evaluated under TensorRT fp32, TensorRT int8, and NNIE):

  |                | trt_fp32 | trt_int8 | nnie    |
  | -------------- | -------- | -------- | ------- |
  | torchvision    | 0.56992  | 0.56424  | 0.56026 |
  | nnie_lr_e-3_ft | 0.56600  | 0.56328  | 0.56612 |
  | lr_e-4_ft      | 0.57884  | 0.57502  | 0.57542 |
  | nnie_lr_e-4_ft | 0.57834  | 0.57524  | 0.57730 |

* COCO

  A simplified YOLOv5s trained for 300 epochs; Hi3559 test result:

  ```
  Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.338
  Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.540
  Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.357
  Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.187
  Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.377
  Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.445
  Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.284
  Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.484
  Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.542
  Average Recall     (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.357
  Average Recall     (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.595
  Average Recall     (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.679
  ```

  After 20 epochs of quantization-aware finetuning, Hi3559 test result:

  ```
  Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.339
  Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.539
  Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.360
  Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.191
  Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.378
  Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.446
  Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.285
  Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.485
  Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.544
  Average Recall     (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.361
  Average Recall     (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.596
  Average Recall     (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.683
  ```
## Todo

* Generate quantized model directly.
## Reference

HiSVP 量化库使用指南 (HiSVP Quantization Library User Guide)

[Quantizing deep convolutional networks for efficient inference: A whitepaper][quant_whitepaper]

[8-bit Inference with TensorRT][trt_quant]

[Distilling the Knowledge in a Neural Network][distillingNN]

[cifar10_qat]: https://github.com/aovoc/nnieqat-pytorch/blob/master/tests/test_cifar10.py
[imagenet_qat]: https://github.com/aovoc/nnieqat-pytorch/blob/master/tests/test_imagenet.py
[imagenet_example]: https://github.com/pytorch/examples/blob/master/imagenet/main.py
[cifar10_example]: https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html
[quant_whitepaper]: https://arxiv.org/abs/1806.08342
[trt_quant]: https://on-demand.gputechconf.com/gtc/2017/presentation/s7310-8-bit-inference-with-tensorrt.pdf
[distillingNN]: https://arxiv.org/abs/1503.02531
================================================ FILE: build_helper.py ================================================
import os import shutil import subprocess import sys import tempfile from distutils import ccompiler def print_warning(*lines): print('**************************************************') for line in lines: print('*** WARNING: %s' % line) print('**************************************************') def get_path(key): return os.environ.get(key, '').split(os.pathsep) def search_on_path(filenames): for p in get_path('PATH'): for filename in filenames: full = os.path.join(p, filename) if os.path.exists(full): return os.path.abspath(full) return None minimum_cuda_version = 10010 maximum_cuda_version = 10030 minimum_cudnn_version = 7000 def get_compiler_setting(): nvcc_path = search_on_path(('nvcc', 'nvcc.exe')) cuda_path_default = None if nvcc_path is None: print_warning('nvcc not in path.', 'Please set path to nvcc.') else: cuda_path_default = os.path.normpath( os.path.join(os.path.dirname(nvcc_path), '..')) cuda_path = os.environ.get('CUDA_PATH', '') # Nvidia default on Windows if len(cuda_path) > 0 and cuda_path != cuda_path_default: print_warning('nvcc path != CUDA_PATH', 'nvcc path: %s' % cuda_path_default, 'CUDA_PATH: %s' % cuda_path) if not os.path.exists(cuda_path): cuda_path = cuda_path_default if not cuda_path and os.path.exists('/usr/local/cuda'): cuda_path = '/usr/local/cuda' include_dirs = [] library_dirs = [] define_macros = [] if cuda_path: include_dirs.append(os.path.join(cuda_path, 'include')) if sys.platform == 'win32': library_dirs.append(os.path.join(cuda_path, 'bin')) library_dirs.append(os.path.join(cuda_path, 'lib', 'x64')) else: library_dirs.append(os.path.join(cuda_path, 'lib64')) library_dirs.append(os.path.join(cuda_path, 'lib')) if sys.platform == 'darwin': library_dirs.append('/usr/local/cuda/lib') return { 'include_dirs': include_dirs, 'library_dirs': library_dirs, 'define_macros': define_macros, 'language': 'c++', } def check_cuda_version(): compiler = ccompiler.new_compiler() settings = get_compiler_setting() try: out = build_and_run(compiler, ''' #include <cuda.h> #include <stdio.h> int main(int argc, char* argv[]) { printf("%d", CUDA_VERSION); return 0; } ''', include_dirs=settings['include_dirs']) except Exception as e: print_warning('Cannot check CUDA version', str(e)) return False cuda_version = int(out) if cuda_version < minimum_cuda_version: print_warning('CUDA version is too old: %d' % cuda_version, 'CUDA v10.1 or CUDA v10.2 is required') return False if cuda_version > maximum_cuda_version: print_warning('CUDA version is too new: %d' % cuda_version, 'CUDA v10.1 or CUDA v10.2 is required') return True def check_cudnn_version(): compiler = ccompiler.new_compiler() settings =
get_compiler_setting() try: out = build_and_run(compiler, ''' #include <cudnn.h> #include <stdio.h> int main(int argc, char* argv[]) { printf("%d", CUDNN_VERSION); return 0; } ''', include_dirs=settings['include_dirs']) except Exception as e: print_warning('Cannot check cuDNN version\n{0}'.format(e)) return False cudnn_version = int(out) if cudnn_version < minimum_cudnn_version: print_warning('cuDNN version is too old: %d' % cudnn_version, 'cuDNN v7 or newer is required') return False return True def build_and_run(compiler, source, libraries=(), include_dirs=(), library_dirs=()): temp_dir = tempfile.mkdtemp() try: fname = os.path.join(temp_dir, 'a.cpp') with open(fname, 'w') as f: f.write(source) objects = compiler.compile([fname], output_dir=temp_dir, include_dirs=include_dirs) try: postargs = ['/MANIFEST'] if sys.platform == 'win32' else [] compiler.link_executable(objects, os.path.join(temp_dir, 'a'), libraries=libraries, library_dirs=library_dirs, extra_postargs=postargs, target_lang='c++') except Exception as e: msg = 'Cannot build a stub file.\nOriginal error: {0}'.format(e) raise Exception(msg) try: out = subprocess.check_output(os.path.join(temp_dir, 'a')) return out except Exception as e: msg = 'Cannot execute a stub file.\nOriginal error: {0}'.format(e) raise Exception(msg) finally: shutil.rmtree(temp_dir, ignore_errors=True)
================================================ FILE: docker/Dockerfile ================================================
ARG PYTORCH="1.6.0" ARG CUDA="10.1" ARG CUDNN="7" FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel ENV TORCH_CUDA_ARCH_LIST="6.0 6.1 7.0+PTX" ENV TORCH_NVCC_FLAGS="-Xfatbin -compress-all" ENV CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" RUN apt-get update && apt-get install -y git ninja-build libglib2.0-0 libsm6 libxrender-dev libxext6 \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* # Install nnieqat RUN pip install nnieqat WORKDIR /root/
================================================ FILE: docs/Makefile ================================================
# Minimal makefile for Sphinx documentation # # You can set these variables from the command line, and also # from the environment for the first two. SPHINXOPTS ?= SPHINXBUILD ?= sphinx-build SOURCEDIR = source BUILDDIR = build # Put it first so that "make" without argument is like "make help". help: @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) .PHONY: help Makefile # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). %: Makefile @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
================================================ FILE: docs/make.bat ================================================
@ECHO OFF pushd %~dp0 REM Command file for Sphinx documentation if "%SPHINXBUILD%" == "" ( set SPHINXBUILD=sphinx-build ) set SOURCEDIR=source set BUILDDIR=build if "%1" == "" goto help %SPHINXBUILD% >NUL 2>NUL if errorlevel 9009 ( echo. echo.The 'sphinx-build' command was not found. Make sure you have Sphinx echo.installed, then set the SPHINXBUILD environment variable to point echo.to the full path of the 'sphinx-build' executable. Alternatively you echo.may add the Sphinx directory to PATH. echo.
echo.If you don't have Sphinx installed, grab it from echo.http://sphinx-doc.org/ exit /b 1 ) %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% goto end :help %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% :end popd ================================================ FILE: docs/source/build_helper.rst ================================================ build\_helper module ==================== .. automodule:: build_helper :members: :undoc-members: :show-inheritance: ================================================ FILE: docs/source/conf.py ================================================ # -*- coding: utf-8 -*- # import os import sys sys.path.insert(0, os.path.abspath('./../../')) # -- Project information ----------------------------------------------------- project = 'nnieqat' copyright = '2020, Minqin Chen' author = 'Minqin Chen' # The short X.Y version version = '' # The full version, including alpha/beta/rc tags release = '0.1.0' # -- General configuration --------------------------------------------------- # If your documentation needs a minimal Sphinx version, state it here. # # needs_sphinx = '1.0' # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ 'sphinx.ext.todo', 'sphinx.ext.githubpages', 'sphinx.ext.autodoc', ] # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: # # source_suffix = ['.rst', '.md'] source_suffix = '.rst' # The master toctree document. master_doc = 'index' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. # # This is also used if you do content translation via gettext catalogs. # Usually you set "language" from the command line for these cases. language = None # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path . exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] # The name of the Pygments (syntax highlighting) style to use. pygments_style = 'sphinx' # -- Options for HTML output ------------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. # # html_theme_options = {} # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ['_static'] # Custom sidebar templates, must be a dictionary that maps document names # to template names. # # The default sidebars (for documents that don't match any pattern) are # defined by theme itself. Builtin themes are using these templates by # default: ``['localtoc.html', 'relations.html', 'sourcelink.html', # 'searchbox.html']``. # # html_sidebars = {} html_theme = 'sphinx_rtd_theme' ================================================ FILE: docs/source/index.rst ================================================ .. nnieqat documentation master file, created by sphinx-quickstart on Fri Aug 21 03:52:34 2020. 
You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. Welcome to nnieqat's documentation! =================================== .. toctree:: :maxdepth: 2 :caption: Contents: Indices and tables ================== * :ref:`genindex` * :ref:`modindex` * :ref:`search` ================================================ FILE: docs/source/modules.rst ================================================ nnieqat ======= .. toctree:: :maxdepth: 4 nnieqat ================================================ FILE: docs/source/nnieqat.cuda10.rst ================================================ nnieqat.cuda10 package ====================== Submodules ---------- nnieqat.cuda10.quantize module ------------------------------ .. automodule:: nnieqat.cuda10.quantize :members: :undoc-members: :show-inheritance: Module contents --------------- .. automodule:: nnieqat.cuda10 :members: :undoc-members: :show-inheritance: ================================================ FILE: docs/source/nnieqat.modules.rst ================================================ nnieqat.modules package ======================= Submodules ---------- nnieqat.modules.conv module --------------------------- .. automodule:: nnieqat.modules.conv :members: :undoc-members: :show-inheritance: nnieqat.modules.linear module ----------------------------- .. automodule:: nnieqat.modules.linear :members: :undoc-members: :show-inheritance: nnieqat.modules.pooling module ------------------------------ .. automodule:: nnieqat.modules.pooling :members: :undoc-members: :show-inheritance: Module contents --------------- .. automodule:: nnieqat.modules :members: :undoc-members: :show-inheritance: ================================================ FILE: docs/source/nnieqat.rst ================================================ nnieqat package =============== Subpackages ----------- .. toctree:: nnieqat.cuda10 nnieqat.gpu nnieqat.modules Module contents --------------- .. automodule:: nnieqat :members: :undoc-members: :show-inheritance: ================================================ FILE: docs/source/setup.rst ================================================ setup module ============ .. automodule:: setup :members: :undoc-members: :show-inheritance: ================================================ FILE: nnieqat/__init__.py ================================================ """ quantize aware training package for Neural Network Inference Engine(NNIE) on pytorch. """ import sys try: from .quantize import quant_dequant_weight, unquant_weight, freeze_bn, \ merge_freeze_bn, register_quantization_hook, test except: raise __all__ = [ "quant_dequant_weight", "unquant_weight", "freeze_bn", "merge_freeze_bn", \ "register_quantization_hook", "test"] test() ================================================ FILE: nnieqat/cuda10/LICENSE.txt ================================================ /* * Copyright (c) 2018, Hisilicon Limited * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ ================================================ FILE: nnieqat/quantize.py ================================================ #!/usr/bin/env python """Quantize function. """ import ctypes import datetime import logging from os.path import abspath, dirname import torch import numpy as np from numba import cuda from quant_impl import fake_quantize _USE_GFPQ_QUANT_LIB = (torch.cuda.device_count() <= 1) class GFPQParamSt(ctypes.Structure): r"""GFPQ param, corresponds with struct GFPQ_PARAM_ST in gfpq.hpp""" _fields_ = [("mode", ctypes.c_int), ("param", ctypes.c_byte * 16)] class _types: r"""Some alias types.""" handle = ctypes.c_void_p stream = ctypes.c_void_p class QuantAndDeQuantGPU(): r"""quantize and dequantize data with GFPG library. """ def __init__(self, libquant_path=dirname(abspath(__file__)) + "/gpu/lib/libgfpq_gpu.so", libcublas_path="libcublas.so", bit_width=8, param_mode=0): global _USE_GFPQ_QUANT_LIB self._bit_width = bit_width if _USE_GFPQ_QUANT_LIB: self._libquant = ctypes.cdll.LoadLibrary(libquant_path) self._libcublas = ctypes.cdll.LoadLibrary(libcublas_path) self._libcublas.cublasCreate_v2.restype = int self._libcublas.cublasCreate_v2.argtypes = [ctypes.c_void_p] self._cublas_handle = _types.handle() self._libcublas.cublasCreate_v2(ctypes.byref(self._cublas_handle)) self._param = GFPQParamSt() self._stream = cuda.stream() self._param.mode = param_mode def __call__(self, tensor, mode=0): r""" Converts float weights to quantized weights. Args: - tensor: input data - mode: GFPQ mode for param GFPQ_MODE_INIT(0): There is no valid parameter in param[]. Generate the parameter and filled in param[]. GFPQ_MODE_UPDATE(1): There is parameter in param[]. Generate new parameter, update param[] when the new parameter is better. GFPQ_MODE_APPLY_ONLY(2): There is parameter in param[]. Don't generate parameter. Just use the param[]. 
""" global _USE_GFPQ_QUANT_LIB if _USE_GFPQ_QUANT_LIB: try: if isinstance(tensor, tuple): for tensor_item in tensor: data_cuda_array = cuda.as_cuda_array( tensor_item.data.detach()) data_p = data_cuda_array.device_ctypes_pointer self._param.mode = mode ret = self._libquant.HI_GFPQ_QuantAndDeQuant_GPU_PY( data_p, data_cuda_array.size, self._bit_width, ctypes.byref(self._param), self._stream.handle, self._cublas_handle) else: data_cuda_array = cuda.as_cuda_array(tensor.data.detach()) data_p = data_cuda_array.device_ctypes_pointer self._param.mode = mode ret = self._libquant.HI_GFPQ_QuantAndDeQuant_GPU_PY( data_p, data_cuda_array.size, self._bit_width, ctypes.byref(self._param), self._stream.handle, self._cublas_handle) except: pass finally: if ret != 0: _USE_GFPQ_QUANT_LIB = False logger = logging.getLogger(__name__) logger.setLevel(logging.WARNING) logger.warning( """Failed to quantize data with default HiSVP GFPQ library, Use implemented quantization algorithm instead.""") if isinstance(tensor, tuple): for tensor_item in tensor: tensor_item.data = fake_quantize( tensor_item.data.detach().clone(), self._bit_width) else: tensor.data = fake_quantize(tensor.data.detach().clone(), self._bit_width) else: if isinstance(tensor, tuple): for tensor_item in tensor: tensor_item.data = fake_quantize(tensor_item.data.detach().clone(), self._bit_width) else: tensor.data = fake_quantize(tensor.data.detach().clone(), self._bit_width) return tensor _QUANT_HANDLE = QuantAndDeQuantGPU() def _fuse_conv_bn_weights(conv_w, conv_b, bn_rm, bn_rv, bn_eps, bn_w, bn_b): """ fuse convolution and batch norm's weight. Args: conv_w (torch.nn.Parameter): convolution weight. conv_b (torch.nn.Parameter): convolution bias. bn_rm (torch.nn.Parameter): batch norm running mean. bn_rv (torch.nn.Parameter): batch norm running variance. bn_eps (torch.nn.Parameter): batch norm epsilon. bn_w (torch.nn.Parameter): batch norm weight. bn_b (torch.nn.Parameter): batch norm weight. Returns: conv_w(torch.nn.Parameter): fused convolution weight. conv_b(torch.nn.Parameter): fused convllution bias. """ if conv_b is None: conv_b = bn_rm.new_zeros(bn_rm.shape) bn_var_rsqrt = torch.rsqrt(bn_rv + bn_eps) conv_w = conv_w * \ (bn_w * bn_var_rsqrt).reshape([-1] + [1] * (len(conv_w.shape) - 1)) conv_b = (conv_b - bn_rm) * bn_var_rsqrt * bn_w + bn_b return torch.nn.Parameter(conv_w), torch.nn.Parameter(conv_b) def _fuse_conv_bn(conv, bn): conv.weight, conv.bias = \ _fuse_conv_bn_weights(conv.weight, conv.bias, bn.running_mean, bn.running_var, bn.eps, bn.weight, bn.bias) return conv def _fuse_modules(model): r"""Fuses a list of modules into a single module Fuses only the following sequence of modules: conv, bn All other sequences are left unchanged. For these sequences, fuse modules on weight level, keep model structure unchanged. Arguments: model: Model containing the modules to be fused Returns: model with fused modules. 
""" children = list(model.named_children()) conv_module = None conv_name = None for name, child in children: if isinstance(child, (torch.nn.BatchNorm1d, torch.nn.BatchNorm2d, torch.nn.BatchNorm3d)): if isinstance(conv_module, (torch.nn.Conv2d, torch.nn.Conv3d)): conv_module = _fuse_conv_bn(conv_module, child) model._modules[conv_name] = conv_module child.eval() child.running_mean = child.running_mean.new_full( child.running_mean.shape, 0) child.running_var = child.running_var.new_full( child.running_var.shape, 1) if child.weight is not None: child.weight.data = child.weight.data.new_full( child.weight.shape, 1) if child.bias is not None: child.bias.data = child.bias.data.new_full( child.bias.shape, 0) child.track_running_stats = False child.momentum = 0 child.eps = 0 conv_module = None elif isinstance(child, (torch.nn.Conv2d, torch.nn.Conv3d)): conv_module = child conv_name = name else: _fuse_modules(child) return model def freeze_bn(m, freeze_bn_affine=True): """Freeze batch normalization. reference: https://arxiv.org/abs/1806.08342 Args: - m (nn.module): torch module - freeze_bn_affine (bool, optional): Freeze affine scale and translation factor or not. Defaults: True. """ if isinstance( m, (torch.nn.BatchNorm1d, torch.nn.BatchNorm2d, torch.nn.BatchNorm3d)): m.eval() if freeze_bn_affine: m.weight.requires_grad = False m.bias.requires_grad = False def merge_freeze_bn(model): """merge batch norm's weight into convolution, then freeze it. Args: model (nn.module): model. Returns: [nn.module]: model. """ model = _fuse_modules(model) model.apply(freeze_bn) return model def unquant_weight(m): """ unquantize weight before update weight, avoid training turbulence. Args: - m (nn.module): torch module. """ try: if hasattr(m, "weight_origin") and m.weight is not None: m.weight.data.copy_(m.weight_origin.data) except AttributeError: pass except TypeError: pass def quant_dequant_weight(m): """ quant weight manually. Args: - m (nn.module): torch module. 
""" global _QUANT_HANDLE global _USE_GFPQ_QUANT_LIB quant_handle = _QUANT_HANDLE if not _USE_GFPQ_QUANT_LIB: quant_handle = QuantAndDeQuantGPU() try: if hasattr(m, "weight_origin") and m.weight is not None: m.weight_origin.data.copy_(m.weight.data) m.weight.data = quant_handle(m.weight.data.detach().clone()) except AttributeError: pass except TypeError: pass def _quantizing_activation(module, input, output): if isinstance( module, (torch.nn.ReLU, torch.nn.ELU, torch.nn.LeakyReLU, torch.nn.PReLU)): global _QUANT_HANDLE global _USE_GFPQ_QUANT_LIB quant_handle = _QUANT_HANDLE if not _USE_GFPQ_QUANT_LIB: quant_handle = QuantAndDeQuantGPU() # print("quantizing activation.") # print(output[0][0][0]) output_type = output.dtype module.activation_max_value = torch.max(torch.max(torch.abs(output.detach())), module.activation_max_value.to(output_type)) # print(module.activation_max_value) tensor_t = torch.cat((output, torch.ones(output[0].shape).cuda().unsqueeze(0) * module.activation_max_value)) output.data = quant_handle(tensor_t.float())[:-1] output = output.to(output_type) # print(output[0][0][0]) def _quantizing_data(module, input): global _QUANT_HANDLE global _USE_GFPQ_QUANT_LIB quant_handle = _QUANT_HANDLE if not _USE_GFPQ_QUANT_LIB: quant_handle = QuantAndDeQuantGPU() # print("quantizing data.") # print(input[0][0][0]) # print("quantizing data.") # print(input[0][0][0]) # input_type = input.dtype if isinstance(input, tuple): for item in input: item_type = item.dtype item = quant_handle(item.float()) item.to(item_type) else: input = quant_handle(input.float()) # input = input.to(input_type) # print(input[0][0][0]) def _quantizing_weight(module, input): global _QUANT_HANDLE global _USE_GFPQ_QUANT_LIB quant_handle = _QUANT_HANDLE if not _USE_GFPQ_QUANT_LIB: quant_handle = QuantAndDeQuantGPU() # print("quantizing weight.") # print(module.weight[0][0][0]) module.weight_origin.data.copy_(module.weight.data) module.weight.data = quant_handle(module.weight.data.detach().clone()) # print(module.weight[0][0][0]) def register_quantization_hook(model, quant_weight=True, quant_activation=True, quant_data=False): """register quantization hook for model. Args: model (:class:`Module`): Module. Returns: Module: self """ # weight quantizing. logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) for _, module in model._modules.items(): if len(list(module.children())) > 0: register_quantization_hook(module, quant_weight, quant_activation) else: if quant_weight and hasattr( module, "weight") and module.weight is not None and not isinstance( module, (torch.nn.BatchNorm1d, torch.nn.BatchNorm2d, torch.nn.BatchNorm3d)): module.register_buffer('weight_origin', module.weight.detach().clone()) if quant_data: module.register_forward_pre_hook(_quantizing_data) logger.info("Quantizing input data of %s", str(module)) module.register_forward_pre_hook(_quantizing_weight) logger.info("Quantizing weight of %s", str(module)) if quant_activation and isinstance( module, (torch.nn.ReLU, torch.nn.ELU, torch.nn.LeakyReLU, torch.nn.PReLU)): module.register_buffer("activation_max_value", torch.tensor(0, dtype=torch.float).cuda()) module.register_forward_hook(_quantizing_activation) logger.info("Quantizing activation of %s", str(module)) return model def test(): r""" Test GFPG library QuantAndDeQuantGPU. 
""" quant_handle = QuantAndDeQuantGPU() logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) tensor = torch.Tensor(np.array([-9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9])).cuda() logging.info("Origin Data: ") logging.info(tensor) start_time = datetime.datetime.now() quant_tensor = quant_handle(tensor) end_time = datetime.datetime.now() logging.info("Quant Data: ") logging.info(quant_tensor) data_expected = np.array([ -8.7240619659, 0.0000000000, 1.0000000000, 2.0000000000, 2.9536523819, 4.0000000000, 4.9674310684, 5.9073047638, 7.0250086784, 8.0000000000, 8.7240619659 ]) logging.info("Data expected: ") logging.info(" ".join([str(v) for v in data_expected])) data_diff = quant_tensor.data.detach().cpu().numpy() - data_expected flag = "success." for num in data_diff: if abs(num) > 0.000000001: flag = "failed." run_time = end_time - start_time logging.info("QuantAndDeQuantGPU time: %s", str(run_time)) logging.info("QuantAndDeQuantGPU %s", flag) ================================================ FILE: pyproject.toml ================================================ [build-system] requires = ["setuptools>=40.8.0", "wheel"] build-backend = "setuptools.build_meta" ================================================ FILE: setup.cfg ================================================ [metadata] license_files = LICENSE.txt ================================================ FILE: setup.py ================================================ from setuptools import setup, find_packages import pathlib from torch.utils.cpp_extension import BuildExtension, CUDAExtension from build_helper import check_cuda_version assert(check_cuda_version()) import os os.system('make -j%d' % os.cpu_count()) here = pathlib.Path(__file__).parent.resolve() long_description = (here / 'README.md').read_text(encoding='utf-8') setup( name='nnieqat', version='0.1.0', description='A nnie quantization aware training tool on pytorch.', long_description=long_description, long_description_content_type='text/markdown', url='https://github.com/aovoc/nnieqat-pytorch', author='Minqin Chen', author_email='minqinchen@deepglint.com', license='MIT', classifiers=[ 'Development Status :: 5 - Production/Stable', "Intended Audience :: Science/Research", 'Intended Audience :: Developers', "Topic :: Scientific/Engineering :: Artificial Intelligence", "Topic :: Software Development :: Libraries :: Python Modules", 'License :: OSI Approved :: MIT License', 'Programming Language :: Python :: 3', 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8', 'Programming Language :: Python :: 3 :: Only', ], keywords=[ "quantization aware training", "deep learning", "neural network", "CNN", "machine learning", ], packages=find_packages(), package_data={ "nnieqat": ["gpu/lib/*gfpq*"], }, python_requires='>=3.5, <4', install_requires=[ "torch>=1.5", "numba>=0.42.0", "numpy>=1.18.1" ], extras_require={ 'test': ["torchvision>=0.4", "nose", "ddt" ], 'docs': [ 'sphinx==2.4.4', 'sphinx_rtd_theme' ] }, ext_modules=[ CUDAExtension( name="quant_impl", sources=[ "./src/fake_quantize.cpp", ], libraries=['quant_impl'], library_dirs=['obj'], ) ], cmdclass={'build_ext': BuildExtension}, test_suite="nnieqat.test.test_cifar10", ) ================================================ FILE: src/fake_quantize.cpp ================================================ #include "fake_quantize.h" #define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor") #define CHECK_CONTIGUOUS(x) 
TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") #define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) Tensor fake_quantize(Tensor a, int bit_width){ CHECK_INPUT(a); return fake_quantize_cuda(a, bit_width); } PYBIND11_MODULE(TORCH_EXTENSION_NAME, m){ m.def("fake_quantize", &fake_quantize, "NNIE Fake Quantization (CUDA)"); } ================================================ FILE: src/fake_quantize.cu ================================================ #include "fake_quantize.h" __global__ void fake_quantize_kernel_cuda(float* __restrict__ a, float* o, int size, float* max_entry, int bit_width) { if(bit_width!=8) bit_width =16; int index = blockIdx.x * blockDim.x + threadIdx.x; if (index < size) { if((*max_entry) < 1e-15 && (*max_entry) > -1e-15){ o[index] = 0; return; } if(bit_width == 8){ float data_max = (*max_entry); int max_entry_qdata_int = floorf(__log2f(data_max) * 16) + 1; data_max = __powf(2, __fdividef(max_entry_qdata_int, 16)); float data_max_floor = __powf(2, __fdividef(max_entry_qdata_int-1, 16)); if(a[index] <= data_max_floor * 0.0020395972313035 // exp(ln(256) / 128) / 512= 2^(1/16-9) = 1.0442737824274 /512 = 0.0020395972313035 && a[index] > - data_max * 0.0020395972313035){ o[index] = 0; return; } //int qdata_int = (int)(log(256 * a[index] / data_max ) / 0.04332169878499658); //ln(256) / 128 = 0.04332169878499658 int qdata_int = 0; if(a[index] > 0){ qdata_int = rintf(__fdividef( __logf(__fdividef(256* a[index],data_max)), 0.04332169878499658)); //ln(256) / 128 = 0.04332169878 if(qdata_int > 127) qdata_int = 127; else if(qdata_int < 0) qdata_int = 0; o[index] = __fdividef(data_max , 256.0) * __expf(qdata_int*0.04332169878499658); } else{ qdata_int = - rintf(__fdividef( __logf(__fdividef(- 256* a[index], data_max)), 0.04332169878499658)); //ln(256) / 128 = 0.04332169878 if(qdata_int < -127) qdata_int = -127; else if(qdata_int >-1) qdata_int = -1; o[index] = - __fdividef(data_max , 256.0) * __expf(- qdata_int*0.04332169878499658); } } else{ float data_max = (*max_entry); int max_entry_qdata_int = floorf(__log2f(data_max) * 128) + 1; data_max = __powf(2, __fdividef(max_entry_qdata_int, 128)); float data_max_floor = __powf(2, __fdividef(max_entry_qdata_int-1, 16)); if(a[index] < data_max_floor *0.0019537861485404 //exp(ln(2^16)/(2^15)) / 512 = 0.0019537861485404 && a[index] > - data_max * 0.0019537861485404){ o[index] = 0; return; } int qdata_int = 0; if(a[index] > 0){ qdata_int = rintf(__fdividef( __logf(__fdividef(65536* a[index], data_max)), 0.00033845077175779)); if(qdata_int > 32767) qdata_int = 32767; else if(qdata_int <0) qdata_int = 0; o[index] = __fdividef(data_max , 65536.0) * __expf(qdata_int * 0.00033845077175779); } else{ qdata_int = - rintf(__fdividef( __logf(__fdividef(- 65536* a[index], data_max)), 0.00033845077175779)); if(qdata_int < -32767) qdata_int = -32767; else if(qdata_int >-1) qdata_int = -1; o[index] = - __fdividef(data_max , 65536.0) * __expf(- qdata_int * 0.00033845077175779); } } } } Tensor fake_quantize_cuda(Tensor a, int bit_width) { auto o = at::zeros_like(a); int64_t size = a.numel(); Tensor max_entry = at::max(at::abs(a)); int blockSize = 1024; int blockNums = (size + blockSize - 1) / blockSize; fake_quantize_kernel_cuda<<>>(a.data_ptr(), o.data_ptr(), size, max_entry.data_ptr(), bit_width); return o; } ================================================ FILE: src/fake_quantize.h ================================================ #include #include #include #include #include #include #include #include #include using namespace at; Tensor 
fake_quantize(Tensor a, int bit_width=8); Tensor fake_quantize_cuda(Tensor a, int bit_width=8); __global__ void fake_quantize_kernel_cuda(float* __restrict__ a, float* o, int size, float* max_entry, int bit_width=8); ================================================ FILE: src/test/Makefile ================================================ # Uncomment for debugging DEBUG := 1 # Pretty build # Q ?= @ CXX := g++ python := python3 PYTHON_HEADER_DIR := $(shell python -c 'from distutils.sysconfig import get_python_inc; print(get_python_inc())') PYTORCH_INCLUDES := $(shell python -c 'from torch.utils.cpp_extension import include_paths; [print(p) for p in include_paths()]') PYTORCH_LIBRARIES := $(shell python -c 'from torch.utils.cpp_extension import library_paths; [print(p) for p in library_paths()]') CUDA_DIR := $(shell python -c 'from torch.utils.cpp_extension import _find_cuda_home; print(_find_cuda_home())') WITH_ABI := $(shell python -c 'import torch; print(int(torch._C._GLIBCXX_USE_CXX11_ABI))') INCLUDE_DIRS := ./ $(CUDA_DIR)/include INCLUDE_DIRS += $(PYTHON_HEADER_DIR) INCLUDE_DIRS += $(PYTORCH_INCLUDES) # Custom (MKL/ATLAS/OpenBLAS) include and lib directories. # BLAS_INCLUDE := /path/to/your/blas # BLAS_LIB := /path/to/your/blas SRC_DIR := ./ OBJ_DIR := ./obj CPP_SRCS := $(wildcard $(SRC_DIR)/*.cpp) CU_SRCS := $(wildcard $(SRC_DIR)/*.cu) OBJS := $(patsubst $(SRC_DIR)/%.cpp,$(OBJ_DIR)/%.o,$(CPP_SRCS)) CU_OBJS := $(patsubst $(SRC_DIR)/%.cu,$(OBJ_DIR)/cuda/%.o,$(CU_SRCS)) STATIC_LIB := $(OBJ_DIR)/libquant_impl.a CUDA_ARCH := -gencode arch=compute_50,code=sm_50 \ -gencode arch=compute_52,code=sm_52 \ -gencode arch=compute_60,code=sm_60 \ -gencode arch=compute_61,code=sm_61 \ -gencode arch=compute_70,code=sm_70 \ -gencode arch=compute_75,code=sm_75 \ -gencode arch=compute_75,code=compute_75 LIBRARIES += stdc++ cudart c10 caffe2 torch torch_python caffe2_gpu ifeq ($(DEBUG), 1) COMMON_FLAGS += -DDEBUG -g -O0 NVCCFLAGS += -g -G # -rdc true else COMMON_FLAGS += -DNDEBUG -O3 endif WARNINGS := -Wall -Wno-sign-compare -Wcomment INCLUDE_DIRS += $(BLAS_INCLUDE) CXXFLAGS += -MMD -MP COMMON_FLAGS += $(foreach includedir,$(INCLUDE_DIRS),-I$(includedir)) \ -DTORCH_API_INCLUDE_EXTENSION_H -D_GLIBCXX_USE_CXX11_ABI=$(WITH_ABI) CXXFLAGS += -pthread -fPIC -fwrapv -std=c++14 $(COMMON_FLAGS) $(WARNINGS) NVCCFLAGS += -std=c++14 -ccbin=$(CXX) -Xcompiler -fPIC -use_fast_math $(COMMON_FLAGS) default: $(STATIC_LIB) $(OBJ_DIR): @ mkdir -p $@ @ mkdir -p $@/cuda $(OBJ_DIR)/%.o: $(SRC_DIR)/%.cpp | $(OBJ_DIR) @ echo CXX $< $(Q)$(CXX) $< $(CXXFLAGS) -c -o $@ $(OBJ_DIR)/cuda/%.o: $(SRC_DIR)/%.cu | $(OBJ_DIR) @ echo NVCC $< $(Q)nvcc $(NVCCFLAGS) $(CUDA_ARCH) -M $< -o ${@:.o=.d} \ -odir $(@D) $(Q)nvcc $(NVCCFLAGS) $(CUDA_ARCH) -c $< -o $@ $(STATIC_LIB): $(OBJS) $(CU_OBJS) | $(OBJ_DIR) $(RM) -f $(STATIC_LIB) $(RM) -rf build dist @ echo LD -o $@ ar rc $(STATIC_LIB) $(OBJS) $(CU_OBJS) build: $(python) setup.py build upload: $(python) setup.py sdist bdist_wheel #twine upload dist/* clean: $(RM) -rf build dist nnieqat.egg-info obj test: nosetests -s tests/test_quant_impl.py --nologcapture lint: pylint nnieqat --reports=n lintfull: pylint nnieqat install: $(python) setup.py install uninstall: $(python) setup.py install --record install.log cat install.log | xargs rm -rf $(RM) install.log ================================================ FILE: src/test/test.cu ================================================ #include #include "../fake_quantize.h" int main(int argc, char *argv[]) { Tensor input = randn({2, 2}); fake_quantize(input, 8); 
return 0; } ================================================ FILE: tests/test_cifar10.py ================================================ # -*- coding:utf-8 -*- from nnieqat import quant_dequant_weight, unquant_weight, merge_freeze_bn, register_quantization_hook import unittest import torch import torch.nn as nn import torch.nn.functional as F import torch.optim as optim from torch.autograd import Variable import torchvision import torchvision.transforms as transforms class Net(nn.Module): def __init__(self): super(Net, self).__init__() self.conv1 = torch.nn.Conv2d(3, 6, 5) self.pool = torch.nn.MaxPool2d(2, 2) self.conv2 = torch.nn.Conv2d(6, 16, 5) self.fc1 = torch.nn.Linear(16 * 5 * 5, 120) self.fc2 = torch.nn.Linear(120, 84) self.fc3 = torch.nn.Linear(84, 10) def forward(self, x): x = self.pool(F.relu(self.conv1(x))) x = self.pool(F.relu(self.conv2(x))) x = x.view(-1, 16 * 5 * 5) x = F.relu(self.fc1(x)) x = F.relu(self.fc2(x)) x = self.fc3(x) return x class TestCifar10(unittest.TestCase): def test(self): transform = transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)) ]) trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform) trainloader = torch.utils.data.DataLoader(trainset, batch_size=4, shuffle=True, num_workers=2) testset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform) testloader = torch.utils.data.DataLoader(testset, batch_size=4, shuffle=True, num_workers=2) dataiter = iter(trainloader) images, labels = dataiter.next() net = Net() register_quantization_hook(net) net.cuda() criterion = nn.CrossEntropyLoss() optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9) print("Cifar10 training:") for epoch in range(5): net.train() if epoch > 2: net = merge_freeze_bn(net) running_loss = 0.0 for i, data in enumerate(trainloader, 0): inputs, labels = data inputs, labels = Variable(inputs.cuda()), Variable( labels.cuda()) optimizer.zero_grad() outputs = net(inputs) loss = criterion(outputs, labels) loss.backward() net.apply(unquant_weight) optimizer.step() running_loss += loss.item() if i % 2000 == 1999: print(' epoch %3d, Iter %5d, loss: %.3f' % (epoch + 1, i + 1, running_loss / 2000)) running_loss = 0.0 print('Finished Training.') # net.apply(quant_dequant_weight) correct = total = 0 for data in testloader: images, labels = data outputs = net(Variable(images.cuda())) _, predicted = torch.max(outputs.data, 1) correct += (predicted == labels.cuda()).sum() total += labels.size(0) print( 'Accuracy(10000 test images, modules\' weight unquantize): %d %%' % (100.0 * correct / total)) if __name__ == "__main__": suite = unittest.TestSuite() suite.addTest(TestCifar10("test")) runner = unittest.TextTestRunner() runner.run(suite) ================================================ FILE: tests/test_imagenet.py ================================================ import argparse import os import random import shutil import time import warnings from nnieqat import quant_dequant_weight, unquant_weight, merge_freeze_bn, register_quantization_hook import torch import torch.nn as nn import torch.nn.parallel import torch.backends.cudnn as cudnn import torch.distributed as dist import torch.optim import torch.multiprocessing as mp import torch.utils.data import torch.utils.data.distributed import torchvision.transforms as transforms import torchvision.datasets as datasets import torchvision.models as models model_names = sorted(name for name in models.__dict__ if 
name.islower() and not name.startswith("__") and callable(models.__dict__[name])) parser = argparse.ArgumentParser(description='PyTorch ImageNet Training') parser.add_argument('data', metavar='DIR', help='path to dataset') parser.add_argument('-a', '--arch', metavar='ARCH', default='squeezenet1_1', choices=model_names, help='model architecture: ' + ' | '.join(model_names) + ' (default: resnet18)') parser.add_argument('-j', '--workers', default=32, type=int, metavar='N', help='number of data loading workers (default: 4)') parser.add_argument('--epochs', default=120, type=int, metavar='N', help='number of total epochs to run') parser.add_argument('--start-epoch', default=0, type=int, metavar='N', help='manual epoch number (useful on restarts)') parser.add_argument('-b', '--batch-size', default=256, type=int, metavar='N', help='mini-batch size (default: 256), this is the total ' 'batch size of all GPUs on the current node when ' 'using Data Parallel or Distributed Data Parallel') parser.add_argument('--lr', '--learning-rate', default=0.001, type=float, metavar='LR', help='initial learning rate', dest='lr') parser.add_argument('--momentum', default=0.9, type=float, metavar='M', help='momentum') parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float, metavar='W', help='weight decay (default: 1e-4)', dest='weight_decay') parser.add_argument('-p', '--print-freq', default=10, type=int, metavar='N', help='print frequency (default: 10)') parser.add_argument('--resume', default='', type=str, metavar='PATH', help='path to latest checkpoint (default: none)') parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true', help='evaluate model on validation set') parser.add_argument('--pretrained', dest='pretrained', action='store_true', help='use pre-trained model') parser.add_argument('--world-size', default=-1, type=int, help='number of nodes for distributed training') parser.add_argument('--rank', default=-1, type=int, help='node rank for distributed training') parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str, help='url used to set up distributed training') parser.add_argument('--dist-backend', default='nccl', type=str, help='distributed backend') parser.add_argument('--seed', default=None, type=int, help='seed for initializing training. ') parser.add_argument('--gpu', default=None, type=int, help='GPU id to use.') parser.add_argument('--multiprocessing-distributed', action='store_true', help='Use multi-processing distributed training to launch ' 'N processes per node, which has N GPUs. This is the ' 'fastest way to use PyTorch for either single node or ' 'multi node data parallel training') best_acc1 = 0 def main(): args = parser.parse_args() if args.seed is not None: random.seed(args.seed) torch.manual_seed(args.seed) cudnn.deterministic = True warnings.warn('You have chosen to seed training. ' 'This will turn on the CUDNN deterministic setting, ' 'which can slow down your training considerably! ' 'You may see unexpected behavior when restarting ' 'from checkpoints.') if args.gpu is not None: warnings.warn('You have chosen a specific GPU. 
This will completely ' 'disable data parallelism.') if args.dist_url == "env://" and args.world_size == -1: args.world_size = int(os.environ["WORLD_SIZE"]) args.distributed = args.world_size > 1 or args.multiprocessing_distributed ngpus_per_node = torch.cuda.device_count() if args.multiprocessing_distributed: # Since we have ngpus_per_node processes per node, the total world_size # needs to be adjusted accordingly args.world_size = ngpus_per_node * args.world_size # Use torch.multiprocessing.spawn to launch distributed processes: the # main_worker process function mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args)) else: # Simply call main_worker function main_worker(args.gpu, ngpus_per_node, args) def main_worker(gpu, ngpus_per_node, args): global best_acc1 args.gpu = gpu if args.gpu is not None: print("Use GPU: {} for training".format(args.gpu)) if args.distributed: if args.dist_url == "env://" and args.rank == -1: args.rank = int(os.environ["RANK"]) if args.multiprocessing_distributed: # For multiprocessing distributed training, rank needs to be the # global rank among all the processes args.rank = args.rank * ngpus_per_node + gpu dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size, rank=args.rank) # create model if args.pretrained: print("=> using pre-trained model '{}'".format(args.arch)) model = models.__dict__[args.arch](pretrained=True) else: print("=> creating model '{}'".format(args.arch)) model = models.__dict__[args.arch]() register_quantization_hook(model) if not torch.cuda.is_available(): print('using CPU, this will be slow') elif args.distributed: # For multiprocessing distributed, DistributedDataParallel constructor # should always set the single device scope, otherwise, # DistributedDataParallel will use all available devices. if args.gpu is not None: torch.cuda.set_device(args.gpu) model.cuda(args.gpu) # When using a single GPU per process and per # DistributedDataParallel, we need to divide the batch size # ourselves based on the total number of GPUs we have args.batch_size = int(args.batch_size / ngpus_per_node) args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node) model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu]) else: model.cuda() # DistributedDataParallel will divide and allocate batch_size to all # available GPUs if device_ids are not set model = torch.nn.parallel.DistributedDataParallel(model) elif args.gpu is not None: torch.cuda.set_device(args.gpu) model = model.cuda(args.gpu) else: # DataParallel will divide and allocate batch_size to all available GPUs if args.arch.startswith('alexnet') or args.arch.startswith('vgg'): model.features = torch.nn.DataParallel(model.features) model.cuda() else: model = torch.nn.DataParallel(model).cuda() # define loss function (criterion) and optimizer criterion = nn.CrossEntropyLoss().cuda(args.gpu) optimizer = torch.optim.SGD(model.parameters(), args.lr, momentum=args.momentum, weight_decay=args.weight_decay) # optionally resume from a checkpoint if args.resume: if os.path.isfile(args.resume): print("=> loading checkpoint '{}'".format(args.resume)) if args.gpu is None: checkpoint = torch.load(args.resume) else: # Map model to be loaded to specified single gpu. 
loc = 'cuda:{}'.format(args.gpu) checkpoint = torch.load(args.resume, map_location=loc) args.start_epoch = checkpoint['epoch'] best_acc1 = checkpoint['best_acc1'] if args.gpu is not None: # best_acc1 may be from a checkpoint from a different GPU best_acc1 = best_acc1.to(args.gpu) model.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) print("=> loaded checkpoint '{}' (epoch {})" .format(args.resume, checkpoint['epoch'])) else: print("=> no checkpoint found at '{}'".format(args.resume)) cudnn.benchmark = True # Data loading code traindir = os.path.join(args.data, 'train') valdir = os.path.join(args.data, 'val') normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) train_dataset = datasets.ImageFolder( traindir, transforms.Compose([ transforms.RandomResizedCrop(224), transforms.RandomHorizontalFlip(), transforms.ToTensor(), normalize, ])) if args.distributed: train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) else: train_sampler = None train_loader = torch.utils.data.DataLoader( train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None), num_workers=args.workers, pin_memory=True, sampler=train_sampler) val_loader = torch.utils.data.DataLoader( datasets.ImageFolder(valdir, transforms.Compose([ transforms.Resize(256), transforms.CenterCrop(224), transforms.ToTensor(), normalize, ])), batch_size=args.batch_size, shuffle=False, num_workers=args.workers, pin_memory=True) if args.evaluate: validate(val_loader, model, criterion, args) return for epoch in range(args.start_epoch, args.epochs): if args.distributed: train_sampler.set_epoch(epoch) adjust_learning_rate(optimizer, epoch, args) # train for one epoch train(train_loader, model, criterion, optimizer, epoch, args) # evaluate on validation set acc1 = validate(val_loader, model, criterion, args) # remember best acc@1 and save checkpoint is_best = acc1 > best_acc1 best_acc1 = max(acc1, best_acc1) if not args.multiprocessing_distributed or (args.multiprocessing_distributed and args.rank % ngpus_per_node == 0): # dump weight quantized model. 
            model.apply(quant_dequant_weight)
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'arch': args.arch,
                    'state_dict': model.state_dict(),
                    'best_acc1': best_acc1,
                    'optimizer': optimizer.state_dict(),
                }, is_best)
            model.apply(unquant_weight)


def train(train_loader, model, criterion, optimizer, epoch, args):
    batch_time = AverageMeter('Time', ':6.3f')
    data_time = AverageMeter('Data', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')
    progress = ProgressMeter(len(train_loader),
                             [batch_time, data_time, losses, top1, top5],
                             prefix="Epoch: [{}]".format(epoch))

    # switch to train mode
    model.train()
    model = merge_freeze_bn(model)

    end = time.time()
    for i, (images, target) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)

        if args.gpu is not None:
            images = images.cuda(args.gpu, non_blocking=True)
        if torch.cuda.is_available():
            target = target.cuda(args.gpu, non_blocking=True)

        # compute output
        output = model(images)
        loss = criterion(output, target)

        # measure accuracy and record loss
        acc1, acc5 = accuracy(output, target, topk=(1, 5))
        losses.update(loss.item(), images.size(0))
        top1.update(acc1[0], images.size(0))
        top5.update(acc5[0], images.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
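        # Standard QAT bookkeeping (an inference from the API names, not from
        # this repo's docs): the forward/backward pass ran against the
        # fake-quantized weights, so restore the fp32 master weights here and
        # let optimizer.step() update those instead.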
        model.apply(unquant_weight)
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0:
            progress.display(i)


def validate(val_loader, model, criterion, args):
    batch_time = AverageMeter('Time', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')
    progress = ProgressMeter(len(val_loader),
                             [batch_time, losses, top1, top5],
                             prefix='Test: ')

    # switch to evaluate mode
    model.eval()

    with torch.no_grad():
        end = time.time()
        for i, (images, target) in enumerate(val_loader):
            if args.gpu is not None:
                images = images.cuda(args.gpu, non_blocking=True)
            if torch.cuda.is_available():
                target = target.cuda(args.gpu, non_blocking=True)

            # compute output
            output = model(images)
            loss = criterion(output, target)

            # measure accuracy and record loss
            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            losses.update(loss.item(), images.size(0))
            top1.update(acc1[0], images.size(0))
            top5.update(acc5[0], images.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % args.print_freq == 0:
                progress.display(i)

        # TODO: this should also be done with the ProgressMeter
        print(' * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}'
              .format(top1=top1, top5=top5))

    return top1.avg


def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
    torch.save(state, filename)
    if is_best:
        shutil.copyfile(filename, 'model_best.pth.tar')


class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self, name, fmt=':f'):
        self.name = name
        self.fmt = fmt
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

    def __str__(self):
        fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})'
        return fmtstr.format(**self.__dict__)


class ProgressMeter(object):
    def __init__(self, num_batches, meters, prefix=""):
        self.batch_fmtstr = self._get_batch_fmtstr(num_batches)
        self.meters = meters
        self.prefix = prefix

    def display(self, batch):
        entries = [self.prefix + self.batch_fmtstr.format(batch)]
        entries += [str(meter) for meter in self.meters]
        print('\t'.join(entries))

    def _get_batch_fmtstr(self, num_batches):
        num_digits = len(str(num_batches // 1))
        fmt = '{:' + str(num_digits) + 'd}'
        return '[' + fmt + '/' + fmt.format(num_batches) + ']'


def adjust_learning_rate(optimizer, epoch, args):
    """Sets the learning rate to the initial LR decayed by 2.5% every 3 epochs"""
    lr = args.lr * (0.975 ** (epoch // 3))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr


def accuracy(output, target, topk=(1,)):
    """Computes the accuracy over the k top predictions for the specified values of k"""
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)

        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))

        res = []
        for k in topk:
            # reshape rather than view: correct[:k] is non-contiguous after t()
            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res


if __name__ == '__main__':
    main()


================================================
FILE: tests/test_merge_freeze_bn.py
================================================
# -*- coding:utf-8 -*-
import unittest
from ddt import ddt, data
import torch
from torch import nn
from nnieqat import merge_freeze_bn, freeze_bn


@ddt
class TestMergeFreezeBNImpl(unittest.TestCase):
    def conv_bn(inp, oup, stride, conv_layer=nn.Conv2d,
                norm_layer=nn.BatchNorm2d):
        return nn.Sequential(conv_layer(inp, oup, 3, stride, 1, bias=False),
                             norm_layer(oup))

    def conv_1x1_bn(inp, oup, conv_layer=nn.Conv2d,
                    norm_layer=nn.BatchNorm2d):
        return nn.Sequential(conv_layer(inp, oup, 1, 1, 0, bias=False),
                             norm_layer(oup))

    data1 = conv_bn(3, 3, 2)
    data2 = conv_1x1_bn(3, 3)

    @data(data1, data2)
    def test(self, m):
        input = torch.randn(1, 3, 10, 10)
        m.eval()
        output_0 = m(input)
        print("module parameter before merge_freeze_bn: ")
        print(list(m.named_parameters()))
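        # merge_freeze_bn presumably folds each BatchNorm into the preceding
        # convolution and freezes the BN statistics. Under the usual folding
        # rule (an assumption; the actual formula lives in nnieqat, not in
        # this test):
        #   w' = w * gamma / sqrt(running_var + eps)
        #   b' = beta + (b - running_mean) * gamma / sqrt(running_var + eps)
        # so output_0 and output_1 should agree up to floating-point rounding.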
        m = merge_freeze_bn(m)
        m.eval()
        output_1 = m(input)
        print("module parameter after merge_freeze_bn: ")
        print(list(m.named_parameters()))
        print("output result before merge_freeze_bn: ")
        print(output_0)
        print("output result after merge_freeze_bn: ")
        print(output_1)
        print("output result diff: ")
        print(output_0 - output_1)


if __name__ == "__main__":
    suite = unittest.TestSuite()
    suite.addTest(TestMergeFreezeBNImpl("test"))
    runner = unittest.TextTestRunner()
    runner.run(suite)


================================================
FILE: tests/test_quant_impl.py
================================================
# -*- coding:utf-8 -*-
import unittest
from ddt import ddt, data
import math
import ctypes
import datetime
from ctypes import *
import numpy as np
from numba import cuda
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"


@ddt
class TestQuantImpl(unittest.TestCase):
    max_thres = 512
    data0 = np.array([0])
    data1 = np.array([v / 25600 + 1.04 for v in range(25600)] +
                     [100, max_thres])
    data2 = np.array([-v / 25600 - 1.04 for v in range(25600)] +
                     [-100, -max_thres])
    data3 = np.array(
        [0, 1, 2, 2.03992188, 2.03996094, 3, 4, 5, 10, 100, max_thres])
    max_thres = 513
    data4 = np.array([v / 25600 + 1.04 for v in range(25600)] +
                     [100, max_thres])
    data5 = np.array([v / 25600 + 1.04 for v in range(25600)] +
                     [100, max_thres])
    data6 = np.array([-v / 25600 - 1.04 for v in range(25600)] +
                     [-100, -max_thres])
    data7 = np.array(
        [0, 1, 2, 2.03992188, 2.03996094, 3, 4, 5, 10, 100, max_thres])
    data8 = np.array([
        0, -1, -2, -2.03992188, -2.03996094, -3, -4, -5, -10, -100, -max_thres
    ])
    data9 = np.array(range(1234))
    data10 = np.array([-v for v in range(1234)])

    @data(data0, data1, data2, data3, data4, data5, data6, data7, data8,
          data9, data10)
    def test(self, data):
        os.environ['CUDA_VISIBLE_DEVICES'] = '0'

        # load library
        dl = ctypes.cdll.LoadLibrary
        # NOTE: the repo ships the GFPQ libraries under nnieqat/cuda10/lib/;
        # this path assumes an installed layout that exposes them under
        # nnieqat/gpu/lib/.
        quant_lib = dl("nnieqat/gpu/lib/libgfpq_gpu.so")
        _libcublas = ctypes.cdll.LoadLibrary("libcublas.so")

        # struct GFPQ_PARAM_ST in gfpq.hpp
        class GFPQ_PARAM_ST(ctypes.Structure):
            _fields_ = [("mode", ctypes.c_int), ("buf", ctypes.c_byte * 16)]

        class _types:
            """Some alias types."""
            handle = ctypes.c_void_p
            stream = ctypes.c_void_p

        data_origin = data.copy()
        print(
            "----------------------------------------------------------------------"
        )
        print("\n\nOriginal data:")
        print(data)
        data = data.astype(np.float32)
        stream = cuda.stream()
        _libcublas.cublasCreate_v2.restype = int
        _libcublas.cublasCreate_v2.argtypes = [ctypes.c_void_p]
        cublas_handle = _types.handle()
        _libcublas.cublasCreate_v2(ctypes.byref(cublas_handle))
        data_gpu = cuda.to_device(data, stream=stream)
        data_p = data_gpu.device_ctypes_pointer
        bit_width = 8
        param = GFPQ_PARAM_ST()
        # init or update param first
        param.mode = 0
        ret = quant_lib.HI_GFPQ_QuantAndDeQuant_GPU_PY(data_p, data.size,
                                                       bit_width,
                                                       ctypes.byref(param),
                                                       stream.handle,
                                                       cublas_handle)
        if ret != 0:
            print("HI_GFPQ_QuantAndDeQuant failed(%d)\n" % (ret))
        # use apply param
        param.mode = 2
        ret = quant_lib.HI_GFPQ_QuantAndDeQuant_GPU_PY(data_p, data.size,
                                                       bit_width,
                                                       ctypes.byref(param),
                                                       stream.handle,
                                                       cublas_handle)
        if ret != 0:
            print("HI_GFPQ_QuantAndDeQuant failed(%d)" % (ret))
        data_gpu.copy_to_host(data, stream=stream)
        # data may not be available yet; wait for the async copy to finish
        stream.synchronize()
        _libcublas.cublasDestroy_v2(cublas_handle)

        import nnieqat
        from quant_impl import fake_quantize
        import torch
        tensor = torch.Tensor(data_origin).cuda()
        tensor.data = fake_quantize(tensor.data.detach(), 8)
        diff = abs(tensor.cpu().numpy() - data)
        # diff_thres = np.max(abs(data)) * 0.001
        # print("\nDIFF > 0.1%: ")
        # print("idx: ", np.where(diff > diff_thres))
        # print("Original data:", data_origin[np.where(diff > diff_thres)])
        # print("GFPQ result:", data[np.where(diff > diff_thres)])
        # print("Impl result:",
        #       tensor.cpu().numpy()[np.where(diff > diff_thres)])
        diff_max = np.max(diff)
        print("\nDIFF MAX: " + str(diff_max))
        print("\nDIFF RATIO: " +
              str(diff_max / max(np.max(abs(data)), pow(10, -18))))


if __name__ == "__main__":
    suite = unittest.TestSuite()
    suite.addTest(TestQuantImpl("test"))
    runner = unittest.TextTestRunner()
    runner.run(suite)
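# Expected outcome (inferred from the commented-out 0.1% threshold above, not
# an asserted result): fake_quantize should reproduce the GFPQ library's
# quantize-dequantize output, so DIFF MAX stays near zero and DIFF RATIO stays
# well below 1e-3 for every dataset.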