Repository: aovoc/nnieqat-pytorch
Branch: master
Commit: 91410cf331a1
Files: 36
Total size: 75.0 KB
Directory structure:
gitextract_yis4nxki/
├── LICENSE.txt
├── MANIFEST.in
├── Makefile
├── README.md
├── build_helper.py
├── docker/
│ └── Dockerfile
├── docs/
│ ├── Makefile
│ ├── make.bat
│ └── source/
│ ├── build_helper.rst
│ ├── conf.py
│ ├── index.rst
│ ├── modules.rst
│ ├── nnieqat.cuda10.rst
│ ├── nnieqat.modules.rst
│ ├── nnieqat.rst
│ └── setup.rst
├── nnieqat/
│ ├── __init__.py
│ ├── cuda10/
│ │ ├── LICENSE.txt
│ │ └── lib/
│ │ ├── gfpq.lib
│ │ ├── libgfpq.a
│ │ ├── libgfpq.so.1.1.5
│ │ ├── libgfpq_gpu.a
│ │ └── libgfpq_gpu.so.1.1.5
│ └── quantize.py
├── pyproject.toml
├── setup.cfg
├── setup.py
├── src/
│ ├── fake_quantize.cpp
│ ├── fake_quantize.cu
│ ├── fake_quantize.h
│ └── test/
│ ├── Makefile
│ └── test.cu
└── tests/
├── test_cifar10.py
├── test_imagenet.py
├── test_merge_freeze_bn.py
└── test_quant_impl.py
================================================
FILE CONTENTS
================================================
================================================
FILE: LICENSE.txt
================================================
MIT License
Copyright (c) Minqin Chen
Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
================================================
FILE: MANIFEST.in
================================================
================================================
FILE: Makefile
================================================
# Uncomment for debugging
# DEBUG := 1
# Pretty build
# Q ?= @

CXX := g++
python := python3

# Query the active Python / PyTorch installation so the extension is built
# against the exact torch build (headers, libraries, CUDA home, C++11 ABI)
# that will load it.  $(python) is used consistently instead of whatever
# bare `python` happens to resolve to on PATH.
PYTHON_HEADER_DIR := $(shell $(python) -c 'from distutils.sysconfig import get_python_inc; print(get_python_inc())')
PYTORCH_INCLUDES := $(shell $(python) -c 'from torch.utils.cpp_extension import include_paths; [print(p) for p in include_paths()]')
PYTORCH_LIBRARIES := $(shell $(python) -c 'from torch.utils.cpp_extension import library_paths; [print(p) for p in library_paths()]')
CUDA_DIR := $(shell $(python) -c 'from torch.utils.cpp_extension import _find_cuda_home; print(_find_cuda_home())')
WITH_ABI := $(shell $(python) -c 'import torch; print(int(torch._C._GLIBCXX_USE_CXX11_ABI))')

INCLUDE_DIRS := ./ $(CUDA_DIR)/include
INCLUDE_DIRS += $(PYTHON_HEADER_DIR)
INCLUDE_DIRS += $(PYTORCH_INCLUDES)

# Custom (MKL/ATLAS/OpenBLAS) include and lib directories.
# BLAS_INCLUDE := /path/to/your/blas
# BLAS_LIB := /path/to/your/blas

SRC_DIR := ./src
OBJ_DIR := ./obj
CPP_SRCS := $(wildcard $(SRC_DIR)/*.cpp)
CU_SRCS := $(wildcard $(SRC_DIR)/*.cu)
OBJS := $(patsubst $(SRC_DIR)/%.cpp,$(OBJ_DIR)/%.o,$(CPP_SRCS))
CU_OBJS := $(patsubst $(SRC_DIR)/%.cu,$(OBJ_DIR)/cuda/%.o,$(CU_SRCS))
STATIC_LIB := $(OBJ_DIR)/libquant_impl.a

# Real GPU code for sm_50..sm_75 plus PTX for compute_75 so the library stays
# forward-compatible with newer GPUs via JIT.
CUDA_ARCH := -gencode arch=compute_50,code=sm_50 \
	-gencode arch=compute_52,code=sm_52 \
	-gencode arch=compute_60,code=sm_60 \
	-gencode arch=compute_61,code=sm_61 \
	-gencode arch=compute_70,code=sm_70 \
	-gencode arch=compute_75,code=sm_75 \
	-gencode arch=compute_75,code=compute_75

# NOTE(review): LIBRARIES is collected but never referenced by a link rule in
# this file; the final shared-object link is performed by setup.py.
LIBRARIES += stdc++ cudart c10 caffe2 torch torch_python caffe2_gpu

ifeq ($(DEBUG), 1)
COMMON_FLAGS += -DDEBUG -g -O0
NVCCFLAGS += -g -G # -rdc true
else
COMMON_FLAGS += -DNDEBUG -O3
endif

WARNINGS := -Wall -Wno-sign-compare -Wcomment

INCLUDE_DIRS += $(BLAS_INCLUDE)
CXXFLAGS += -MMD -MP
COMMON_FLAGS += $(foreach includedir,$(INCLUDE_DIRS),-I$(includedir)) \
	-DTORCH_API_INCLUDE_EXTENSION_H -D_GLIBCXX_USE_CXX11_ABI=$(WITH_ABI)
CXXFLAGS += -pthread -fPIC -fwrapv -std=c++14 $(COMMON_FLAGS) $(WARNINGS)
NVCCFLAGS += -std=c++14 -ccbin=$(CXX) -Xcompiler -fPIC -use_fast_math $(COMMON_FLAGS)

default: $(STATIC_LIB)

$(OBJ_DIR):
	@ mkdir -p $@
	@ mkdir -p $@/cuda

$(OBJ_DIR)/%.o: $(SRC_DIR)/%.cpp | $(OBJ_DIR)
	@ echo CXX $<
	$(Q)$(CXX) $< $(CXXFLAGS) -c -o $@

# Two nvcc passes: first emits the .d dependency file next to the object,
# second compiles the object itself.
$(OBJ_DIR)/cuda/%.o: $(SRC_DIR)/%.cu | $(OBJ_DIR)
	@ echo NVCC $<
	$(Q)nvcc $(NVCCFLAGS) $(CUDA_ARCH) -M $< -o ${@:.o=.d} \
		-odir $(@D)
	$(Q)nvcc $(NVCCFLAGS) $(CUDA_ARCH) -c $< -o $@

$(STATIC_LIB): $(OBJS) $(CU_OBJS) | $(OBJ_DIR)
	$(RM) $(STATIC_LIB)
	$(RM) -r build dist
	@ echo LD -o $@
	ar rc $(STATIC_LIB) $(OBJS) $(CU_OBJS)

build:
	$(python) setup.py build

upload:
	$(python) setup.py sdist bdist_wheel
	#twine upload dist/*

clean:
	$(RM) -r build dist nnieqat.egg-info

test:
	nosetests -s tests/test_quant_impl.py --nologcapture
	nosetests -s tests/test_merge_freeze_bn.py --nologcapture

lint:
	pylint nnieqat --reports=n

lintfull:
	pylint nnieqat

install:
	$(python) setup.py install

uninstall:
	$(python) setup.py install --record install.log
	xargs rm -rf < install.log
	$(RM) install.log

# These targets are commands, not files.  Without .PHONY, `make build` is
# shadowed by the `build/` directory that setup.py creates and would report
# "'build' is up to date"; a stray file named `clean` or `test` would do the
# same to those targets.
.PHONY: default build upload clean test lint lintfull install uninstall
================================================
FILE: README.md
================================================
# nnieqat-pytorch
Nnieqat is a quantize aware training package for Neural Network Inference Engine(NNIE) on pytorch, it uses hisilicon quantization library to quantize module's weight and activation as fake fp32 format.
## Table of Contents
- [nnieqat-pytorch](#nnieqat-pytorch)
- [Table of Contents](#table-of-contents)
- [Installation](#installation)
- [Usage](#usage)
- [Code Examples](#code-examples)
- [Results](#results)
- [Todo](#todo)
- [Reference](#reference)
## Installation
* Supported Platforms: Linux
* Accelerators and GPUs: NVIDIA GPUs via CUDA driver ***10.1*** or ***10.2***.
* Dependencies:
* python >= 3.5, < 4
* llvmlite >= 0.31.0
* pytorch >= 1.5
* numba >= 0.42.0
* numpy >= 1.18.1
* Install nnieqat via pypi:
```shell
$ pip install nnieqat
```
* Install nnieqat in docker(easy way to solve environment problems):
```shell
$ cd docker
$ docker build -t nnieqat-image .
```
* Install nnieqat via repo:
```shell
$ git clone https://github.com/aovoc/nnieqat-pytorch
$ cd nnieqat-pytorch
$ make install
```
## Usage
* add quantization hook.
quantize and dequantize weight and data with HiSVP GFPQ library in forward() process.
```python
from nnieqat import quant_dequant_weight, unquant_weight, merge_freeze_bn, register_quantization_hook
...
...
register_quantization_hook(model)
...
```
* merge bn weight into conv and freeze bn
suggest finetuning from a well-trained model, merge_freeze_bn at beginning. do it after a few epochs of training otherwise.
```python
from nnieqat import quant_dequant_weight, unquant_weight, merge_freeze_bn, register_quantization_hook
...
...
model.train()
model = merge_freeze_bn(model) #it will change bn to eval() mode during training
...
```
* Unquantize weight before update it
```python
from nnieqat import quant_dequant_weight, unquant_weight, merge_freeze_bn, register_quantization_hook
...
...
model.apply(unquant_weight) # using original weight while updating
optimizer.step()
...
```
* Dump weight optimized model
```python
from nnieqat import quant_dequant_weight, unquant_weight, merge_freeze_bn, register_quantization_hook
...
...
model.apply(quant_dequant_weight)
save_checkpoint(...)
model.apply(unquant_weight)
...
```
* Using EMA with caution(Not recommended).
## Code Examples
* [Cifar10 quantization aware training example][cifar10_qat] (add nnieqat into [pytorch_cifar10_tutorial][cifar10_example])
```python tests/test_cifar10.py```
* [ImageNet quantization finetuning example][imagenet_qat] (add nnieqat into [pytorch_imagenet_main.py][imagenet_example])
```python tests/test_imagenet.py --pretrained path_to_imagenet_dataset```
## Results
* ImageNet
```
python test/test_imagenet.py /data/imgnet/ --arch squeezenet1_1 --lr 0.001 --pretrained --epoch 10 # nnie_lr_e-3_ft
python pytorh_imagenet_main.py /data/imgnet/ --arch squeezenet1_1 --lr 0.0001 --pretrained --epoch 10 # lr_e-4_ft
python test/test_imagenet.py /data/imgnet/ --arch squeezenet1_1 --lr 0.0001 --pretrained --epoch 10 # nnie_lr_e-4_ft
```
finetune result:
| | trt_fp32 | trt_int8 | nnie |
| -------- | -------- | -------- | -------- |
| torchvision | 0.56992 | 0.56424 | 0.56026 |
| nnie_lr_e-3_ft | 0.56600 | 0.56328 | 0.56612 |
| lr_e-4_ft | 0.57884 | 0.57502 | 0.57542 |
| nnie_lr_e-4_ft | 0.57834 | 0.57524 | 0.57730 |
* coco
net: simplified yolov5s
train 300 epochs, hi3559 test result:
Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.338
Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.540
Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.357
Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.187
Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.377
Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.445
Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.284
Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.484
Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.542
Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.357
Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.595
Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.679
finetune 20 epochs, hi3559 test result:
Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.339
Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.539
Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.360
Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.191
Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.378
Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.446
Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.285
Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.485
Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.544
Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.361
Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.596
Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.683
## Todo
* Generate quantized model directly.
## Reference
HiSVP 量化库使用指南
[Quantizing deep convolutional networks for efficient inference: A whitepaper][quant_whitepaper]
[8-bit Inference with TensorRT][trt_quant]
[Distilling the Knowledge in a Neural Network][distillingNN]
[cifar10_qat]: https://github.com/aovoc/nnieqat-pytorch/blob/master/tests/test_cifar10.py
[imagenet_qat]: https://github.com/aovoc/nnieqat-pytorch/blob/master/tests/test_imagenet.py
[imagenet_example]: https://github.com/pytorch/examples/blob/master/imagenet/main.py
[cifar10_example]: https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html
[quant_whitepaper]: https://arxiv.org/abs/1806.08342
[trt_quant]: https://on-demand.gputechconf.com/gtc/2017/presentation/s7310-8-bit-inference-with-tensorrt.pdf
[distillingNN]: https://arxiv.org/abs/1503.02531
================================================
FILE: build_helper.py
================================================
import os
import shutil
import subprocess
import sys
import tempfile
from distutils import ccompiler
def print_warning(*lines):
    """Print each of *lines* as a '*** WARNING:' message framed by banners."""
    banner = '**************************************************'
    print(banner)
    for message in lines:
        print('*** WARNING: %s' % message)
    print(banner)
def get_path(key):
    """Split environment variable *key* on the platform path separator.

    Returns [''] when the variable is unset (mirrors ''.split(os.pathsep)).
    """
    value = os.environ.get(key, '')
    return value.split(os.pathsep)
def search_on_path(filenames):
    """Return the absolute path of the first of *filenames* found on PATH.

    Scans PATH entries in order; returns None when nothing matches.
    """
    for directory in get_path('PATH'):
        for candidate in filenames:
            candidate_path = os.path.join(directory, candidate)
            if os.path.exists(candidate_path):
                return os.path.abspath(candidate_path)
    return None
# Supported toolkit range, expressed as CUDA_VERSION integers
# (major * 1000 + minor * 10): >= 10.1 and (warn-only) <= 10.2.
minimum_cuda_version = 10010
maxinum_cuda_version = 10030  # sic: misspelling of "maximum" kept, name is referenced below
minimum_cudnn_version = 7000
def get_compiler_setting():
    """Build include/library directory settings for compiling CUDA stubs.

    The CUDA root is located from, in order: the nvcc found on PATH, the
    CUDA_PATH environment variable (Nvidia's Windows convention), and
    finally /usr/local/cuda.  Returns a dict with 'include_dirs',
    'library_dirs', 'define_macros' and 'language' keys.
    """
    nvcc = search_on_path(('nvcc', 'nvcc.exe'))
    default_cuda_root = None
    if nvcc is None:
        print_warning('nvcc not in path.', 'Please set path to nvcc.')
    else:
        default_cuda_root = os.path.normpath(
            os.path.join(os.path.dirname(nvcc), '..'))

    cuda_root = os.environ.get('CUDA_PATH', '')  # Nvidia default on Windows
    if len(cuda_root) > 0 and cuda_root != default_cuda_root:
        print_warning('nvcc path != CUDA_PATH',
                      'nvcc path: %s' % default_cuda_root,
                      'CUDA_PATH: %s' % cuda_root)
    if not os.path.exists(cuda_root):
        cuda_root = default_cuda_root
    if not cuda_root and os.path.exists('/usr/local/cuda'):
        cuda_root = '/usr/local/cuda'

    include_dirs = []
    library_dirs = []
    define_macros = []
    if cuda_root:
        include_dirs.append(os.path.join(cuda_root, 'include'))
        if sys.platform == 'win32':
            library_dirs.append(os.path.join(cuda_root, 'bin'))
            library_dirs.append(os.path.join(cuda_root, 'lib', 'x64'))
        else:
            library_dirs.append(os.path.join(cuda_root, 'lib64'))
            library_dirs.append(os.path.join(cuda_root, 'lib'))
    if sys.platform == 'darwin':
        library_dirs.append('/usr/local/cuda/lib')
    return {
        'include_dirs': include_dirs,
        'library_dirs': library_dirs,
        'define_macros': define_macros,
        'language': 'c++',
    }
def check_cuda_version():
    """Compile and run a stub that prints CUDA_VERSION; validate the range.

    Returns:
        bool: True when the detected CUDA version is supported (>= 10.1).
        A version newer than 10.2 only triggers a warning and still returns
        True, matching the original behaviour.
    """
    compiler = ccompiler.new_compiler()
    settings = get_compiler_setting()
    try:
        # Bug fix: the #include directives had lost their header names,
        # which made the stub uncompilable.
        out = build_and_run(compiler,
                            '''
                            #include <cuda.h>
                            #include <stdio.h>
                            int main(int argc, char* argv[]) {
                              printf("%d", CUDA_VERSION);
                              return 0;
                            }
                            ''',
                            include_dirs=settings['include_dirs'])
    except Exception as e:
        print_warning('Cannot check CUDA version', str(e))
        return False
    cuda_version = int(out)
    if cuda_version < minimum_cuda_version:
        print_warning('CUDA version is too old: %d' % cuda_version,
                      'CUDA v10.1 or CUDA v10.2 is required')
        return False
    if cuda_version > maxinum_cuda_version:
        # Warn but do not fail on too-new toolkits.
        print_warning('CUDA version is too new: %d' % cuda_version,
                      'CUDA v10.1 or CUDA v10.2 is required')
    return True
def check_cudnn_version():
    """Compile and run a stub that prints CUDNN_VERSION; validate the range.

    Returns:
        bool: True when the detected cuDNN version is >= 7.0.
    """
    compiler = ccompiler.new_compiler()
    settings = get_compiler_setting()
    try:
        # Bug fix: the #include directives had lost their header names,
        # which made the stub uncompilable.
        out = build_and_run(compiler,
                            '''
                            #include <cudnn.h>
                            #include <stdio.h>
                            int main(int argc, char* argv[]) {
                              printf("%d", CUDNN_VERSION);
                              return 0;
                            }
                            ''',
                            include_dirs=settings['include_dirs'])
    except Exception as e:
        print_warning('Cannot check cuDNN version\n{0}'.format(e))
        return False
    cudnn_version = int(out)
    if cudnn_version < minimum_cudnn_version:
        print_warning('cuDNN version is too old: %d' % cudnn_version,
                      'cuDNN v7 or newer is required')
        return False
    return True
def build_and_run(compiler,
                  source,
                  libraries=(),
                  include_dirs=(),
                  library_dirs=()):
    """Compile *source* as C++, link an executable, run it, return its stdout.

    Args:
        compiler: a distutils ccompiler instance.
        source (str): C/C++ translation unit text.
        libraries / include_dirs / library_dirs: forwarded to the compiler.

    Returns:
        bytes: the program's standard output.

    Raises:
        Exception: with a descriptive message when linking or running fails.
        The build happens in a throw-away temp directory that is always
        removed.
    """
    temp_dir = tempfile.mkdtemp()
    try:
        source_path = os.path.join(temp_dir, 'a.cpp')
        with open(source_path, 'w') as stream:
            stream.write(source)
        objects = compiler.compile([source_path],
                                   output_dir=temp_dir,
                                   include_dirs=include_dirs)
        exe_path = os.path.join(temp_dir, 'a')
        link_args = ['/MANIFEST'] if sys.platform == 'win32' else []
        try:
            compiler.link_executable(objects,
                                     exe_path,
                                     libraries=libraries,
                                     library_dirs=library_dirs,
                                     extra_postargs=link_args,
                                     target_lang='c++')
        except Exception as e:
            raise Exception(
                'Cannot build a stub file.\nOriginal error: {0}'.format(e))
        try:
            return subprocess.check_output(exe_path)
        except Exception as e:
            raise Exception(
                'Cannot execute a stub file.\nOriginal error: {0}'.format(e))
    finally:
        shutil.rmtree(temp_dir, ignore_errors=True)
================================================
FILE: docker/Dockerfile
================================================
ARG PYTORCH="1.6.0"
ARG CUDA="10.1"
ARG CUDNN="7"
# Base image: official PyTorch devel image pinned to a CUDA version
# (10.1/10.2) matching what the package supports.
FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel
# Compile CUDA extensions for Pascal/Volta plus PTX for forward compatibility.
ENV TORCH_CUDA_ARCH_LIST="6.0 6.1 7.0+PTX"
ENV TORCH_NVCC_FLAGS="-Xfatbin -compress-all"
# NOTE(review): ENV performs no command substitution — the literal text
# "$(dirname $(which conda))/../" is stored, and only expands if a shell
# evaluates it later. Confirm this is intended.
ENV CMAKE_PREFIX_PATH="$(dirname $(which conda))/../"
# System packages needed by common vision stacks; clean apt caches to keep
# the image small.
RUN apt-get update && apt-get install -y git ninja-build libglib2.0-0 libsm6 libxrender-dev libxext6 \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*
# Install nnieqat
RUN pip install nnieqat
WORKDIR /root/
================================================
FILE: docs/Makefile
================================================
# Minimal makefile for Sphinx documentation
#
# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS ?=
SPHINXBUILD ?= sphinx-build
# Source .rst files live in source/, HTML (etc.) is written to build/.
SOURCEDIR = source
BUILDDIR = build

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
================================================
FILE: docs/make.bat
================================================
@ECHO OFF
REM Sphinx documentation build wrapper for Windows (mirrors docs/Makefile).
pushd %~dp0
REM Command file for Sphinx documentation
if "%SPHINXBUILD%" == "" (
	set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=source
set BUILDDIR=build
if "%1" == "" goto help
REM Probe that sphinx-build is runnable; errorlevel 9009 means the command
REM was not found on PATH.
%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
	echo.
	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
	echo.installed, then set the SPHINXBUILD environment variable to point
	echo.to the full path of the 'sphinx-build' executable. Alternatively you
	echo.may add the Sphinx directory to PATH.
	echo.
	echo.If you don't have Sphinx installed, grab it from
	echo.http://sphinx-doc.org/
	exit /b 1
)
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end
:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
:end
popd
================================================
FILE: docs/source/build_helper.rst
================================================
build\_helper module
====================
.. automodule:: build_helper
:members:
:undoc-members:
:show-inheritance:
================================================
FILE: docs/source/conf.py
================================================
# -*- coding: utf-8 -*-
#
import os
import sys
sys.path.insert(0, os.path.abspath('./../../'))
# -- Project information -----------------------------------------------------
project = 'nnieqat'
copyright = '2020, Minqin Chen'
author = 'Minqin Chen'
# The short X.Y version
version = ''
# The full version, including alpha/beta/rc tags
release = '0.1.0'
# -- General configuration ---------------------------------------------------
# If your documentation needs a minimal Sphinx version, state it here.
#
# needs_sphinx = '1.0'
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
'sphinx.ext.todo',
'sphinx.ext.githubpages',
'sphinx.ext.autodoc',
]
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
# The suffix(es) of source filenames.
# You can specify multiple suffix as a list of string:
#
# source_suffix = ['.rst', '.md']
source_suffix = '.rst'
# The master toctree document.
master_doc = 'index'
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
language = None
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path .
exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'
# -- Options for HTML output -------------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
#
# html_theme_options = {}
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
# Custom sidebar templates, must be a dictionary that maps document names
# to template names.
#
# The default sidebars (for documents that don't match any pattern) are
# defined by theme itself. Builtin themes are using these templates by
# default: ``['localtoc.html', 'relations.html', 'sourcelink.html',
# 'searchbox.html']``.
#
# html_sidebars = {}
html_theme = 'sphinx_rtd_theme'
================================================
FILE: docs/source/index.rst
================================================
.. nnieqat documentation master file, created by
sphinx-quickstart on Fri Aug 21 03:52:34 2020.
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.
Welcome to nnieqat's documentation!
===================================
.. toctree::
:maxdepth: 2
:caption: Contents:
Indices and tables
==================
* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`
================================================
FILE: docs/source/modules.rst
================================================
nnieqat
=======
.. toctree::
:maxdepth: 4
nnieqat
================================================
FILE: docs/source/nnieqat.cuda10.rst
================================================
nnieqat.cuda10 package
======================
Submodules
----------
nnieqat.cuda10.quantize module
------------------------------
.. automodule:: nnieqat.cuda10.quantize
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: nnieqat.cuda10
:members:
:undoc-members:
:show-inheritance:
================================================
FILE: docs/source/nnieqat.modules.rst
================================================
nnieqat.modules package
=======================
Submodules
----------
nnieqat.modules.conv module
---------------------------
.. automodule:: nnieqat.modules.conv
:members:
:undoc-members:
:show-inheritance:
nnieqat.modules.linear module
-----------------------------
.. automodule:: nnieqat.modules.linear
:members:
:undoc-members:
:show-inheritance:
nnieqat.modules.pooling module
------------------------------
.. automodule:: nnieqat.modules.pooling
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: nnieqat.modules
:members:
:undoc-members:
:show-inheritance:
================================================
FILE: docs/source/nnieqat.rst
================================================
nnieqat package
===============
Subpackages
-----------
.. toctree::
nnieqat.cuda10
nnieqat.gpu
nnieqat.modules
Module contents
---------------
.. automodule:: nnieqat
:members:
:undoc-members:
:show-inheritance:
================================================
FILE: docs/source/setup.rst
================================================
setup module
============
.. automodule:: setup
:members:
:undoc-members:
:show-inheritance:
================================================
FILE: nnieqat/__init__.py
================================================
""" quantize aware training package for Neural Network Inference Engine(NNIE) on pytorch.
"""
import sys

# Re-export the public quantization helpers from the implementation module.
# (The previous ``try: … except: raise`` wrapper was a no-op and has been
# removed; an ImportError propagates unchanged either way.)
from .quantize import quant_dequant_weight, unquant_weight, freeze_bn, \
    merge_freeze_bn, register_quantization_hook, test

__all__ = [
    "quant_dequant_weight", "unquant_weight", "freeze_bn", "merge_freeze_bn", \
    "register_quantization_hook", "test"]

# Self-check run at import time: exercises the quantization backend.
# NOTE(review): running test() on import is surprising for library users —
# consider making it opt-in.
test()
================================================
FILE: nnieqat/cuda10/LICENSE.txt
================================================
/*
* Copyright (c) 2018, Hisilicon Limited
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
================================================
FILE: nnieqat/quantize.py
================================================
#!/usr/bin/env python
"""Quantize function.
"""
import ctypes
import datetime
import logging
from os.path import abspath, dirname
import torch
import numpy as np
from numba import cuda
# NOTE(review): absolute import — presumably provided by the compiled
# extension installed alongside this package; verify at packaging time.
from quant_impl import fake_quantize
# Use the bundled HiSVP GFPQ CUDA library only on single-GPU machines; with
# multiple GPUs the pure-python fallback (fake_quantize) is used instead.
_USE_GFPQ_QUANT_LIB = (torch.cuda.device_count() <= 1)
class GFPQParamSt(ctypes.Structure):
    r"""GFPQ param, corresponds with struct GFPQ_PARAM_ST in gfpq.hpp"""
    # mode: GFPQ_MODE_* selector; param: 16 opaque bytes of library state.
    _fields_ = [("mode", ctypes.c_int), ("param", ctypes.c_byte * 16)]
class _types:
    r"""Some alias types."""
    # Opaque device/library pointers passed to the GFPQ / cuBLAS C APIs.
    handle = ctypes.c_void_p
    stream = ctypes.c_void_p
class QuantAndDeQuantGPU():
    r"""Quantize and dequantize data with the HiSVP GFPQ library, in place.

    When the native library path fails (load error, call error or non-zero
    return code) the instance falls back to the pure-python ``fake_quantize``
    implementation and disables the native path process-wide via the
    module-global ``_USE_GFPQ_QUANT_LIB`` flag.
    """

    def __init__(self,
                 libquant_path=dirname(abspath(__file__)) +
                 "/gpu/lib/libgfpq_gpu.so",
                 libcublas_path="libcublas.so",
                 bit_width=8,
                 param_mode=0):
        # NOTE(review): the default path points at "gpu/lib" while the
        # repository ships the library under "cuda10/lib" — confirm the
        # intended install layout.
        global _USE_GFPQ_QUANT_LIB
        self._bit_width = bit_width
        if _USE_GFPQ_QUANT_LIB:
            self._libquant = ctypes.cdll.LoadLibrary(libquant_path)
            self._libcublas = ctypes.cdll.LoadLibrary(libcublas_path)
            self._libcublas.cublasCreate_v2.restype = int
            self._libcublas.cublasCreate_v2.argtypes = [ctypes.c_void_p]
            self._cublas_handle = _types.handle()
            self._libcublas.cublasCreate_v2(ctypes.byref(self._cublas_handle))
        self._param = GFPQParamSt()
        self._stream = cuda.stream()
        self._param.mode = param_mode

    def __call__(self, tensor, mode=0):
        r""" Converts float weights to quantized weights.
        Args:
            - tensor: input data (torch tensor or tuple of tensors); mutated
              in place via its ``.data`` attribute.
            - mode: GFPQ mode for param
                GFPQ_MODE_INIT(0): There is no valid parameter in param[].
                    Generate the parameter and filled in param[].
                GFPQ_MODE_UPDATE(1): There is parameter in param[]. Generate
                    new parameter, update param[] when the new parameter is
                    better.
                GFPQ_MODE_APPLY_ONLY(2): There is parameter in param[]. Don't
                    generate parameter. Just use the param[].
        """
        global _USE_GFPQ_QUANT_LIB
        if _USE_GFPQ_QUANT_LIB:
            # Bug fix: ``ret`` was referenced in the finally-block without
            # being initialised; an exception raised before the library call
            # produced a NameError instead of reaching the fallback.  Start
            # non-zero so any failure routes into the python fallback below.
            ret = -1
            try:
                if isinstance(tensor, tuple):
                    for tensor_item in tensor:
                        data_cuda_array = cuda.as_cuda_array(
                            tensor_item.data.detach())
                        data_p = data_cuda_array.device_ctypes_pointer
                        self._param.mode = mode
                        ret = self._libquant.HI_GFPQ_QuantAndDeQuant_GPU_PY(
                            data_p, data_cuda_array.size, self._bit_width,
                            ctypes.byref(self._param), self._stream.handle,
                            self._cublas_handle)
                else:
                    data_cuda_array = cuda.as_cuda_array(tensor.data.detach())
                    data_p = data_cuda_array.device_ctypes_pointer
                    self._param.mode = mode
                    ret = self._libquant.HI_GFPQ_QuantAndDeQuant_GPU_PY(
                        data_p, data_cuda_array.size, self._bit_width,
                        ctypes.byref(self._param), self._stream.handle,
                        self._cublas_handle)
            except Exception:
                # Best-effort: any native failure falls through to the
                # python fallback selected by ``ret`` below.
                pass
            finally:
                if ret != 0:
                    # Disable the native path for the rest of the process and
                    # quantize with the python implementation instead.
                    _USE_GFPQ_QUANT_LIB = False
                    logger = logging.getLogger(__name__)
                    logger.setLevel(logging.WARNING)
                    logger.warning(
                        """Failed to quantize data with default HiSVP GFPQ library,
                        Use implemented quantization algorithm instead.""")
                    if isinstance(tensor, tuple):
                        for tensor_item in tensor:
                            tensor_item.data = fake_quantize(
                                tensor_item.data.detach().clone(),
                                self._bit_width)
                    else:
                        tensor.data = fake_quantize(
                            tensor.data.detach().clone(), self._bit_width)
        else:
            if isinstance(tensor, tuple):
                for tensor_item in tensor:
                    tensor_item.data = fake_quantize(
                        tensor_item.data.detach().clone(), self._bit_width)
            else:
                tensor.data = fake_quantize(tensor.data.detach().clone(),
                                            self._bit_width)
        return tensor
_QUANT_HANDLE = QuantAndDeQuantGPU()
def _fuse_conv_bn_weights(conv_w, conv_b, bn_rm, bn_rv, bn_eps, bn_w, bn_b):
""" fuse convolution and batch norm's weight.
Args:
conv_w (torch.nn.Parameter): convolution weight.
conv_b (torch.nn.Parameter): convolution bias.
bn_rm (torch.nn.Parameter): batch norm running mean.
bn_rv (torch.nn.Parameter): batch norm running variance.
bn_eps (torch.nn.Parameter): batch norm epsilon.
bn_w (torch.nn.Parameter): batch norm weight.
bn_b (torch.nn.Parameter): batch norm weight.
Returns:
conv_w(torch.nn.Parameter): fused convolution weight.
conv_b(torch.nn.Parameter): fused convllution bias.
"""
if conv_b is None:
conv_b = bn_rm.new_zeros(bn_rm.shape)
bn_var_rsqrt = torch.rsqrt(bn_rv + bn_eps)
conv_w = conv_w * \
(bn_w * bn_var_rsqrt).reshape([-1] + [1] * (len(conv_w.shape) - 1))
conv_b = (conv_b - bn_rm) * bn_var_rsqrt * bn_w + bn_b
return torch.nn.Parameter(conv_w), torch.nn.Parameter(conv_b)
def _fuse_conv_bn(conv, bn):
    """Fold *bn*'s parameters into *conv* in place; return the updated conv."""
    fused_w, fused_b = _fuse_conv_bn_weights(
        conv.weight, conv.bias, bn.running_mean, bn.running_var,
        bn.eps, bn.weight, bn.bias)
    conv.weight = fused_w
    conv.bias = fused_b
    return conv
def _fuse_modules(model):
    r"""Fuses a list of modules into a single module
    Fuses only the following sequence of modules:
    conv, bn
    All other sequences are left unchanged.
    For these sequences, fuse modules on weight level, keep model structure unchanged.
    Arguments:
        model: Model containing the modules to be fused
    Returns:
        model with fused modules.
    """
    children = list(model.named_children())
    conv_module = None
    conv_name = None
    for name, child in children:
        if isinstance(child, (torch.nn.BatchNorm1d, torch.nn.BatchNorm2d,
                              torch.nn.BatchNorm3d)):
            # Only Conv2d/Conv3d immediately followed by a BN sibling are
            # fused; Conv1d (and anything else) is left untouched.
            if isinstance(conv_module, (torch.nn.Conv2d, torch.nn.Conv3d)):
                conv_module = _fuse_conv_bn(conv_module, child)
                model._modules[conv_name] = conv_module
                # Neutralise the BN layer so the structure is unchanged but
                # BN becomes an identity: zero mean, unit variance, unit
                # gamma, zero beta, stats frozen.
                child.eval()
                child.running_mean = child.running_mean.new_full(
                    child.running_mean.shape, 0)
                child.running_var = child.running_var.new_full(
                    child.running_var.shape, 1)
                if child.weight is not None:
                    child.weight.data = child.weight.data.new_full(
                        child.weight.shape, 1)
                if child.bias is not None:
                    child.bias.data = child.bias.data.new_full(
                        child.bias.shape, 0)
                child.track_running_stats = False
                child.momentum = 0
                child.eps = 0
            conv_module = None
        elif isinstance(child, (torch.nn.Conv2d, torch.nn.Conv3d)):
            # Remember the conv so a directly-following BN sibling can fuse.
            conv_module = child
            conv_name = name
        else:
            # Recurse into container modules; fusion only pairs direct
            # siblings at each level.
            _fuse_modules(child)
    return model
def freeze_bn(m, freeze_bn_affine=True):
    """Freeze batch normalization.

    Puts BN layers into eval mode (stops running-stat updates) and, when
    *freeze_bn_affine* is True, stops gradient updates of the affine
    parameters.  Non-BN modules are left untouched.

    reference: https://arxiv.org/abs/1806.08342

    Args:
        - m (nn.module): torch module (typically applied via model.apply).
        - freeze_bn_affine (bool, optional): Freeze affine scale and
            translation factor or not. Defaults: True.
    """
    if isinstance(
            m,
            (torch.nn.BatchNorm1d, torch.nn.BatchNorm2d, torch.nn.BatchNorm3d)):
        m.eval()
        if freeze_bn_affine:
            # Bug fix: BN layers built with affine=False have weight/bias
            # set to None; guard to avoid an AttributeError.
            if m.weight is not None:
                m.weight.requires_grad = False
            if m.bias is not None:
                m.bias.requires_grad = False
def merge_freeze_bn(model):
    """Fold BN weights into preceding convolutions, then freeze every BN layer.

    Args:
        model (nn.module): model to transform in place.

    Returns:
        [nn.module]: the same model with BN fused and frozen.
    """
    fused = _fuse_modules(model)
    fused.apply(freeze_bn)
    return fused
def unquant_weight(m):
    """Restore a module's original (un-quantized) weight before an update.

    Copies ``weight_origin`` back into ``weight`` so the optimizer steps on
    full-precision values; modules without a backup are silently skipped.

    Args:
        - m (nn.module): torch module.
    """
    try:
        if hasattr(m, "weight_origin") and m.weight is not None:
            m.weight.data.copy_(m.weight_origin.data)
    except (AttributeError, TypeError):
        # Best-effort: tolerate modules with unusual weight attributes.
        pass
def quant_dequant_weight(m):
    """Quantize-dequantize a module's weight in place (manual trigger).

    Args:
        - m (nn.module): torch module.
    """
    global _QUANT_HANDLE
    global _USE_GFPQ_QUANT_LIB
    handle = QuantAndDeQuantGPU() if not _USE_GFPQ_QUANT_LIB else _QUANT_HANDLE
    try:
        if hasattr(m, "weight_origin") and m.weight is not None:
            # Keep a pristine fp32 copy so unquant_weight() can restore it.
            m.weight_origin.data.copy_(m.weight.data)
            m.weight.data = handle(m.weight.data.detach().clone())
    except (AttributeError, TypeError):
        pass
def _quantizing_activation(module, input, output):
    # Forward hook: fake-quantize the output of supported activation layers.
    # The quantization scale is anchored to the running max |activation|
    # stored on the module (buffer registered in register_quantization_hook).
    if isinstance(
            module,
            (torch.nn.ReLU, torch.nn.ELU, torch.nn.LeakyReLU, torch.nn.PReLU)):
        global _QUANT_HANDLE
        global _USE_GFPQ_QUANT_LIB
        quant_handle = _QUANT_HANDLE
        if not _USE_GFPQ_QUANT_LIB:
            quant_handle = QuantAndDeQuantGPU()
        # print("quantizing activation.")
        # print(output[0][0][0])
        output_type = output.dtype
        # Track the largest absolute activation seen so far.
        module.activation_max_value = torch.max(torch.max(torch.abs(output.detach())), module.activation_max_value.to(output_type))
        # print(module.activation_max_value)
        # Append one extra row filled with the running max so the quantizer
        # derives its scale from that max; the row is dropped again below.
        tensor_t = torch.cat((output, torch.ones(output[0].shape).cuda().unsqueeze(0) * module.activation_max_value))
        output.data = quant_handle(tensor_t.float())[:-1]
        # NOTE(review): rebinding the local name below does not change the
        # tensor callers see -- only the in-place `.data` assignment above
        # takes effect.  If a dtype restore is intended, the hook would need
        # to return the converted tensor; confirm intended behavior.
        output = output.to(output_type)
        # print(output[0][0][0])
def _quantizing_data(module, input):
    """Forward pre-hook: fake-quantize a module's input tensors.

    Returns the quantized replacement input.  A forward pre-hook's non-None
    return value substitutes the module's input (see
    Module.register_forward_pre_hook).  Bug fix: the previous implementation
    only rebound the loop variable / the local name ``input``, so the
    quantized tensors were computed and then silently discarded.

    Args:
        module: the module about to run forward (unused here).
        input: tensor or tuple of tensors passed to the module.
    """
    global _QUANT_HANDLE
    global _USE_GFPQ_QUANT_LIB
    quant_handle = _QUANT_HANDLE
    if not _USE_GFPQ_QUANT_LIB:
        quant_handle = QuantAndDeQuantGPU()
    if isinstance(input, tuple):
        quantized = []
        for item in input:
            item_type = item.dtype
            # Quantize in fp32, then restore the caller's dtype.
            quantized.append(quant_handle(item.float()).to(item_type))
        return tuple(quantized)
    input_type = input.dtype
    return quant_handle(input.float()).to(input_type)
def _quantizing_weight(module, input):
    """Forward pre-hook: snapshot the fp32 weight, then fake-quantize it.

    The snapshot in ``weight_origin`` lets unquant_weight() restore the
    full-precision weight after backward.
    """
    global _QUANT_HANDLE
    global _USE_GFPQ_QUANT_LIB
    handle = QuantAndDeQuantGPU() if not _USE_GFPQ_QUANT_LIB else _QUANT_HANDLE
    module.weight_origin.data.copy_(module.weight.data)
    module.weight.data = handle(module.weight.data.detach().clone())
def register_quantization_hook(model,
                               quant_weight=True,
                               quant_activation=True,
                               quant_data=False):
    """register quantization hook for model.

    Recursively walks the module tree and, on each leaf module, registers
    forward (pre-)hooks that fake-quantize weights, input data and/or
    activations.

    Args:
        model (:class:`Module`): Module.
        quant_weight (bool): quantize weights of leaf modules owning a
            non-None ``weight`` (BatchNorm layers excluded).
        quant_activation (bool): quantize outputs of
            ReLU/ELU/LeakyReLU/PReLU modules.
        quant_data (bool): also quantize each leaf module's input tensors.

    Returns:
        Module: self
    """
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)
    for _, module in model._modules.items():
        if len(list(module.children())) > 0:
            # Bug fix: quant_data was previously dropped in this recursive
            # call, so input-data hooks were never registered on any module
            # below the top level.
            register_quantization_hook(module, quant_weight, quant_activation,
                                       quant_data)
        else:
            if quant_weight and hasattr(
                    module,
                    "weight") and module.weight is not None and not isinstance(
                        module, (torch.nn.BatchNorm1d, torch.nn.BatchNorm2d,
                                 torch.nn.BatchNorm3d)):
                # Buffer holding the fp32 weight so it can be restored after
                # backward (see unquant_weight / _quantizing_weight).
                module.register_buffer('weight_origin',
                                       module.weight.detach().clone())
                if quant_data:
                    module.register_forward_pre_hook(_quantizing_data)
                    logger.info("Quantizing input data of %s", str(module))
                module.register_forward_pre_hook(_quantizing_weight)
                logger.info("Quantizing weight of %s", str(module))
            if quant_activation and isinstance(
                    module, (torch.nn.ReLU, torch.nn.ELU, torch.nn.LeakyReLU,
                             torch.nn.PReLU)):
                # Running max |activation| used as the quantization scale.
                module.register_buffer(
                    "activation_max_value",
                    torch.tensor(0, dtype=torch.float).cuda())
                module.register_forward_hook(_quantizing_activation)
                logger.info("Quantizing activation of %s", str(module))
    return model
def test():
    r""" Test GFPG library QuantAndDeQuantGPU.

    Quantizes a small reference tensor, compares it against golden values and
    logs the timing plus a success/failure verdict.
    """
    quant_handle = QuantAndDeQuantGPU()
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)
    tensor = torch.Tensor(np.array([-9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9])).cuda()
    # Fix: log through the configured module logger; the previous calls to
    # the root `logging` module ignored the level set just above.
    logger.info("Origin Data: ")
    logger.info(tensor)
    start_time = datetime.datetime.now()
    quant_tensor = quant_handle(tensor)
    end_time = datetime.datetime.now()
    logger.info("Quant Data: ")
    logger.info(quant_tensor)
    # Golden outputs for the input above (8-bit NNIE log quantization).
    data_expected = np.array([
        -8.7240619659, 0.0000000000, 1.0000000000, 2.0000000000, 2.9536523819,
        4.0000000000, 4.9674310684, 5.9073047638, 7.0250086784, 8.0000000000,
        8.7240619659
    ])
    logger.info("Data expected: ")
    logger.info(" ".join([str(v) for v in data_expected]))
    data_diff = quant_tensor.data.detach().cpu().numpy() - data_expected
    flag = "success."
    for num in data_diff:
        if abs(num) > 0.000000001:
            flag = "failed."
            break  # one mismatch is enough to fail
    run_time = end_time - start_time
    logger.info("QuantAndDeQuantGPU time: %s", str(run_time))
    logger.info("QuantAndDeQuantGPU %s", flag)
================================================
FILE: pyproject.toml
================================================
[build-system]
requires = ["setuptools>=40.8.0", "wheel"]
build-backend = "setuptools.build_meta"
================================================
FILE: setup.cfg
================================================
[metadata]
license_files = LICENSE.txt
================================================
FILE: setup.py
================================================
from setuptools import setup, find_packages
import pathlib
from torch.utils.cpp_extension import BuildExtension, CUDAExtension
from build_helper import check_cuda_version
assert (check_cuda_version())
import os

# Build the static quantization library up front and fail fast on error.
# Bug fix: os.system's exit status was previously ignored, letting a broken
# native build surface later as an obscure extension-link failure.
_make_status = os.system('make -j%d' % os.cpu_count())
if _make_status != 0:
    raise RuntimeError(
        'building libquant_impl via make failed (exit status %d)' %
        _make_status)

here = pathlib.Path(__file__).parent.resolve()
# Long description is fed from the README for PyPI.
long_description = (here / 'README.md').read_text(encoding='utf-8')

setup(
    name='nnieqat',
    version='0.1.0',
    description='A nnie quantization aware training tool on pytorch.',
    long_description=long_description,
    long_description_content_type='text/markdown',
    url='https://github.com/aovoc/nnieqat-pytorch',
    author='Minqin Chen',
    author_email='minqinchen@deepglint.com',
    license='MIT',
    classifiers=[
        'Development Status :: 5 - Production/Stable',
        "Intended Audience :: Science/Research",
        'Intended Audience :: Developers',
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
        "Topic :: Software Development :: Libraries :: Python Modules",
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.5',
        'Programming Language :: Python :: 3.6',
        'Programming Language :: Python :: 3.7',
        'Programming Language :: Python :: 3.8',
        'Programming Language :: Python :: 3 :: Only',
    ],
    keywords=[
        "quantization aware training",
        "deep learning",
        "neural network",
        "CNN",
        "machine learning",
    ],
    packages=find_packages(),
    package_data={
        # Ship the prebuilt GFPQ libraries.  NOTE(review): in this source
        # tree they live under nnieqat/cuda10/lib, so the old "gpu/lib"
        # pattern matched nothing; both patterns are kept (harmless if one
        # matches nothing) -- confirm the intended layout.
        "nnieqat": ["gpu/lib/*gfpq*", "cuda10/lib/*gfpq*"],
    },
    python_requires='>=3.5, <4',
    install_requires=[
        "torch>=1.5",
        "numba>=0.42.0",
        "numpy>=1.18.1"
    ],
    extras_require={
        'test': ["torchvision>=0.4",
                 "nose",
                 "ddt"
                 ],
        'docs': [
            'sphinx==2.4.4',
            'sphinx_rtd_theme'
        ]
    },
    ext_modules=[
        CUDAExtension(
            name="quant_impl",
            sources=[
                "./src/fake_quantize.cpp",
            ],
            # Link against the static library produced by the make step above.
            libraries=['quant_impl'],
            library_dirs=['obj'],
        )
    ],
    cmdclass={'build_ext': BuildExtension},
    test_suite="nnieqat.test.test_cifar10",
)
================================================
FILE: src/fake_quantize.cpp
================================================
#include "fake_quantize.h"
// Input-validation helpers: the extension only accepts contiguous CUDA tensors.
#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)

// Python-facing entry point: validate the tensor, then dispatch to the CUDA
// implementation (fake_quantize_cuda, defined in fake_quantize.cu).
Tensor fake_quantize(Tensor a, int bit_width){
    CHECK_INPUT(a);
    return fake_quantize_cuda(a, bit_width);
}

// Expose fake_quantize to Python as part of the torch extension module.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m){
    m.def("fake_quantize", &fake_quantize, "NNIE Fake Quantization (CUDA)");
}
================================================
FILE: src/fake_quantize.cu
================================================
#include "fake_quantize.h"
// Element-wise NNIE fake-quantization kernel.
// Each value of `a` is snapped to the nearest code of a logarithmic 8- or
// 16-bit grid whose ceiling is anchored at *max_entry; results go to `o`.
__global__ void fake_quantize_kernel_cuda(float* __restrict__ a,
                                          float* o, int size,
                                          float* max_entry,
                                          int bit_width) {
    // Only 8- and 16-bit grids exist; any other request falls back to 16.
    if(bit_width!=8) bit_width =16;
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index < size) {
        // Degenerate all-zero tensor: every quantized value is 0.
        if((*max_entry) < 1e-15 && (*max_entry) > -1e-15){
            o[index] = 0;
            return;
        }
        if(bit_width == 8){
            // Round the grid ceiling up to the next 2^(k/16) step.
            float data_max = (*max_entry);
            int max_entry_qdata_int = floorf(__log2f(data_max) * 16) + 1;
            data_max = __powf(2, __fdividef(max_entry_qdata_int, 16));
            float data_max_floor = __powf(2, __fdividef(max_entry_qdata_int-1, 16));
            // Dead zone: values below half the smallest code collapse to 0.
            if(a[index] <= data_max_floor * 0.0020395972313035 // exp(ln(256) / 128) / 512= 2^(1/16-9) = 1.0442737824274 /512 = 0.0020395972313035
                && a[index] > - data_max * 0.0020395972313035){
                o[index] = 0;
                return;
            }
            //int qdata_int = (int)(log(256 * a[index] / data_max ) / 0.04332169878499658); //ln(256) / 128 = 0.04332169878499658
            int qdata_int = 0;
            if(a[index] > 0){
                qdata_int = rintf(__fdividef( __logf(__fdividef(256* a[index],data_max)), 0.04332169878499658)); //ln(256) / 128 = 0.04332169878
                if(qdata_int > 127) qdata_int = 127;
                else if(qdata_int < 0) qdata_int = 0;
                o[index] = __fdividef(data_max , 256.0) * __expf(qdata_int*0.04332169878499658);
            }
            else{
                qdata_int = - rintf(__fdividef( __logf(__fdividef(- 256* a[index], data_max)), 0.04332169878499658)); //ln(256) / 128 = 0.04332169878
                if(qdata_int < -127) qdata_int = -127;
                else if(qdata_int >-1) qdata_int = -1;
                o[index] = - __fdividef(data_max , 256.0) * __expf(- qdata_int*0.04332169878499658);
            }
        }
        else{
            // 16-bit grid: exponent steps of 1/128, codes in [-32767, 32767].
            float data_max = (*max_entry);
            int max_entry_qdata_int = floorf(__log2f(data_max) * 128) + 1;
            data_max = __powf(2, __fdividef(max_entry_qdata_int, 128));
            // BUG FIX: the divisor here was 16 (copied from the 8-bit branch);
            // the floor bound must use the same 1/128 exponent step as
            // data_max just above, otherwise the dead-zone threshold is
            // wildly wrong for 16-bit quantization.
            float data_max_floor = __powf(2, __fdividef(max_entry_qdata_int-1, 128));
            if(a[index] < data_max_floor *0.0019537861485404 //exp(ln(2^16)/(2^15)) / 512 = 0.0019537861485404
                && a[index] > - data_max * 0.0019537861485404){
                o[index] = 0;
                return;
            }
            int qdata_int = 0;
            if(a[index] > 0){
                qdata_int = rintf(__fdividef( __logf(__fdividef(65536* a[index], data_max)), 0.00033845077175779));
                if(qdata_int > 32767) qdata_int = 32767;
                else if(qdata_int <0) qdata_int = 0;
                o[index] = __fdividef(data_max , 65536.0) * __expf(qdata_int * 0.00033845077175779);
            }
            else{
                qdata_int = - rintf(__fdividef( __logf(__fdividef(- 65536* a[index], data_max)), 0.00033845077175779));
                if(qdata_int < -32767) qdata_int = -32767;
                else if(qdata_int >-1) qdata_int = -1;
                o[index] = - __fdividef(data_max , 65536.0) * __expf(- qdata_int * 0.00033845077175779);
            }
        }
    }
}
// Host-side launcher: quantize tensor `a` on the GPU, return a new tensor.
Tensor fake_quantize_cuda(Tensor a, int bit_width) {
    auto o = at::zeros_like(a);
    int64_t size = a.numel();
    // Quantization scale anchor: the largest absolute value in the tensor.
    Tensor max_entry = at::max(at::abs(a));
    int blockSize = 1024;
    int blockNums = (size + blockSize - 1) / blockSize;
    // NOTE(review): the kernel launch configuration and the data_ptr
    // template arguments were stripped by a text-extraction pass that ate
    // every "<...>" span in this file (cf. the bare #include lines in
    // fake_quantize.h); restored here.
    fake_quantize_kernel_cuda<<<blockNums, blockSize>>>(
        a.data_ptr<float>(),
        o.data_ptr<float>(),
        static_cast<int>(size),  // kernel takes int; explicit narrowing
        max_entry.data_ptr<float>(),
        bit_width);
    return o;
}
================================================
FILE: src/fake_quantize.h
================================================
// NOTE(review): the targets of the #include directives below were lost in a
// text-extraction pass that stripped every "<...>" span in this file (the
// kernel launch syntax in fake_quantize.cu was mangled the same way).  They
// presumably pulled in <torch/extension.h> plus ATen/CUDA runtime headers --
// restore the exact list from version control.
#include
#include
#include
#include
#include
#include
#include
#include
#include
using namespace at;

// Python-facing wrapper: validates input, dispatches to the CUDA path.
Tensor fake_quantize(Tensor a, int bit_width=8);
// Device implementation: returns a new tensor with quantized values.
Tensor fake_quantize_cuda(Tensor a, int bit_width=8);
// Element-wise quantization kernel (defined in fake_quantize.cu).
__global__ void fake_quantize_kernel_cuda(float* __restrict__ a,
                                          float* o, int size,
                                          float* max_entry,
                                          int bit_width=8);
================================================
FILE: src/test/Makefile
================================================
# Uncomment for debugging
DEBUG := 1

# Pretty build
# Q ?= @

CXX := g++
python := python3

# Probe the Python/PyTorch/CUDA environment once at parse time (:=).
# Fix: these shells previously invoked a hard-coded `python`, which can be a
# different (or missing) interpreter even though $(python) is defined above.
PYTHON_HEADER_DIR := $(shell $(python) -c 'from distutils.sysconfig import get_python_inc; print(get_python_inc())')
PYTORCH_INCLUDES := $(shell $(python) -c 'from torch.utils.cpp_extension import include_paths; [print(p) for p in include_paths()]')
PYTORCH_LIBRARIES := $(shell $(python) -c 'from torch.utils.cpp_extension import library_paths; [print(p) for p in library_paths()]')
CUDA_DIR := $(shell $(python) -c 'from torch.utils.cpp_extension import _find_cuda_home; print(_find_cuda_home())')
WITH_ABI := $(shell $(python) -c 'import torch; print(int(torch._C._GLIBCXX_USE_CXX11_ABI))')

INCLUDE_DIRS := ./ $(CUDA_DIR)/include
INCLUDE_DIRS += $(PYTHON_HEADER_DIR)
INCLUDE_DIRS += $(PYTORCH_INCLUDES)

# Custom (MKL/ATLAS/OpenBLAS) include and lib directories.
# BLAS_INCLUDE := /path/to/your/blas
# BLAS_LIB := /path/to/your/blas

SRC_DIR := ./
OBJ_DIR := ./obj
CPP_SRCS := $(wildcard $(SRC_DIR)/*.cpp)
CU_SRCS := $(wildcard $(SRC_DIR)/*.cu)
OBJS := $(patsubst $(SRC_DIR)/%.cpp,$(OBJ_DIR)/%.o,$(CPP_SRCS))
CU_OBJS := $(patsubst $(SRC_DIR)/%.cu,$(OBJ_DIR)/cuda/%.o,$(CU_SRCS))
STATIC_LIB := $(OBJ_DIR)/libquant_impl.a

# Device code for every supported SM plus PTX for forward compatibility.
CUDA_ARCH := -gencode arch=compute_50,code=sm_50 \
             -gencode arch=compute_52,code=sm_52 \
             -gencode arch=compute_60,code=sm_60 \
             -gencode arch=compute_61,code=sm_61 \
             -gencode arch=compute_70,code=sm_70 \
             -gencode arch=compute_75,code=sm_75 \
             -gencode arch=compute_75,code=compute_75

LIBRARIES += stdc++ cudart c10 caffe2 torch torch_python caffe2_gpu

ifeq ($(DEBUG), 1)
  COMMON_FLAGS += -DDEBUG -g -O0
  NVCCFLAGS += -g -G # -rdc true
else
  COMMON_FLAGS += -DNDEBUG -O3
endif

WARNINGS := -Wall -Wno-sign-compare -Wcomment

INCLUDE_DIRS += $(BLAS_INCLUDE)
# -MMD -MP: auto-generate header dependency files alongside objects.
CXXFLAGS += -MMD -MP
COMMON_FLAGS += $(foreach includedir,$(INCLUDE_DIRS),-I$(includedir)) \
                -DTORCH_API_INCLUDE_EXTENSION_H -D_GLIBCXX_USE_CXX11_ABI=$(WITH_ABI)
CXXFLAGS += -pthread -fPIC -fwrapv -std=c++14 $(COMMON_FLAGS) $(WARNINGS)
NVCCFLAGS += -std=c++14 -ccbin=$(CXX) -Xcompiler -fPIC -use_fast_math $(COMMON_FLAGS)
default: $(STATIC_LIB)

# These targets never produce files with their own names; without .PHONY a
# stray file called e.g. `test` or `build` would silently disable them.
.PHONY: default build upload clean test lint lintfull install uninstall

$(OBJ_DIR):
	@ mkdir -p $@
	@ mkdir -p $@/cuda

$(OBJ_DIR)/%.o: $(SRC_DIR)/%.cpp | $(OBJ_DIR)
	@ echo CXX $<
	$(Q)$(CXX) $< $(CXXFLAGS) -c -o $@

# Two nvcc passes: emit a dependency file next to the object, then compile.
$(OBJ_DIR)/cuda/%.o: $(SRC_DIR)/%.cu | $(OBJ_DIR)
	@ echo NVCC $<
	$(Q)nvcc $(NVCCFLAGS) $(CUDA_ARCH) -M $< -o ${@:.o=.d} \
		-odir $(@D)
	$(Q)nvcc $(NVCCFLAGS) $(CUDA_ARCH) -c $< -o $@

# $(RM) already expands to `rm -f`, so the old `-f`/`-rf` spellings were
# redundant; `-r` is kept where directories are removed.
$(STATIC_LIB): $(OBJS) $(CU_OBJS) | $(OBJ_DIR)
	$(RM) $(STATIC_LIB)
	$(RM) -r build dist
	@ echo LD -o $@
	ar rc $(STATIC_LIB) $(OBJS) $(CU_OBJS)

build:
	$(python) setup.py build

upload:
	$(python) setup.py sdist bdist_wheel
	#twine upload dist/*

clean:
	$(RM) -r build dist nnieqat.egg-info obj

test:
	nosetests -s tests/test_quant_impl.py --nologcapture

lint:
	pylint nnieqat --reports=n

lintfull:
	pylint nnieqat

install:
	$(python) setup.py install

uninstall:
	$(python) setup.py install --record install.log
	cat install.log | xargs rm -rf
	$(RM) install.log
================================================
FILE: src/test/test.cu
================================================
#include
#include "../fake_quantize.h"
int main(int argc, char *argv[])
{
    // Smoke-test the 8-bit fake quantization path on a tiny random tensor.
    Tensor probe = randn({2, 2});
    fake_quantize(probe, 8);
    return 0;
}
================================================
FILE: tests/test_cifar10.py
================================================
# -*- coding:utf-8 -*-
from nnieqat import quant_dequant_weight, unquant_weight, merge_freeze_bn, register_quantization_hook
import unittest
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
import torchvision
import torchvision.transforms as transforms
class Net(nn.Module):
    """Small LeNet-style CNN for CIFAR-10: 3x32x32 images in, 10 logits out."""

    def __init__(self):
        super(Net, self).__init__()
        # Two conv/pool stages followed by a three-layer classifier head.
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        """Run the conv feature extractor, flatten, then classify."""
        out = self.pool(F.relu(self.conv1(x)))
        out = self.pool(F.relu(self.conv2(out)))
        out = out.view(-1, 16 * 5 * 5)
        out = F.relu(self.fc1(out))
        out = F.relu(self.fc2(out))
        return self.fc3(out)
class TestCifar10(unittest.TestCase):
    """End-to-end QAT smoke test: train the small CNN on CIFAR-10 with
    quantization hooks attached, then report test-set accuracy."""

    def test(self):
        transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
        ])
        trainset = torchvision.datasets.CIFAR10(root='./data',
                                                train=True,
                                                download=True,
                                                transform=transform)
        trainloader = torch.utils.data.DataLoader(trainset,
                                                  batch_size=4,
                                                  shuffle=True,
                                                  num_workers=2)
        testset = torchvision.datasets.CIFAR10(root='./data',
                                               train=False,
                                               download=True,
                                               transform=transform)
        testloader = torch.utils.data.DataLoader(testset,
                                                 batch_size=4,
                                                 shuffle=True,
                                                 num_workers=2)
        dataiter = iter(trainloader)
        # Bug fix: DataLoader iterators have no .next() method on current
        # PyTorch (it is not part of the Python 3 iterator protocol); use
        # the builtin next() instead.
        images, labels = next(dataiter)
        net = Net()
        # Hooks must be attached before moving the model to the GPU.
        register_quantization_hook(net)
        net.cuda()
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
        print("Cifar10 training:")
        for epoch in range(5):
            net.train()
            if epoch > 2:
                # Fold + freeze BN in the last epochs, per the QAT recipe.
                net = merge_freeze_bn(net)
            running_loss = 0.0
            for i, data in enumerate(trainloader, 0):
                inputs, labels = data
                inputs, labels = Variable(inputs.cuda()), Variable(
                    labels.cuda())
                optimizer.zero_grad()
                outputs = net(inputs)
                loss = criterion(outputs, labels)
                loss.backward()
                # Restore fp32 weights before stepping the optimizer.
                net.apply(unquant_weight)
                optimizer.step()
                running_loss += loss.item()
                if i % 2000 == 1999:
                    print(' epoch %3d, Iter %5d, loss: %.3f' %
                          (epoch + 1, i + 1, running_loss / 2000))
                    running_loss = 0.0
        print('Finished Training.')
        # net.apply(quant_dequant_weight)
        correct = total = 0
        for data in testloader:
            images, labels = data
            outputs = net(Variable(images.cuda()))
            _, predicted = torch.max(outputs.data, 1)
            correct += (predicted == labels.cuda()).sum()
            total += labels.size(0)
        print(
            'Accuracy(10000 test images, modules\' weight unquantize): %d %%' %
            (100.0 * correct / total))
if __name__ == "__main__":
    # Run the single end-to-end test directly, without test discovery.
    runner = unittest.TextTestRunner()
    runner.run(unittest.TestSuite([TestCifar10("test")]))
================================================
FILE: tests/test_imagenet.py
================================================
import argparse
import os
import random
import shutil
import time
import warnings
from nnieqat import quant_dequant_weight, unquant_weight, merge_freeze_bn, register_quantization_hook
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.distributed as dist
import torch.optim
import torch.multiprocessing as mp
import torch.utils.data
import torch.utils.data.distributed
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision.models as models
# Every lowercase callable in torchvision.models is a valid --arch value.
model_names = sorted(name for name in models.__dict__
                     if name.islower() and not name.startswith("__")
                     and callable(models.__dict__[name]))

parser = argparse.ArgumentParser(description='PyTorch ImageNet Training')
parser.add_argument('data', metavar='DIR',
                    help='path to dataset')
# NOTE(review): default is squeezenet1_1 but the help text still says
# resnet18 (inherited from the upstream example) -- confirm which is intended.
parser.add_argument('-a', '--arch', metavar='ARCH', default='squeezenet1_1',
                    choices=model_names,
                    help='model architecture: ' +
                    ' | '.join(model_names) +
                    ' (default: resnet18)')
# NOTE(review): default is 32, help text says 4 -- same upstream drift.
parser.add_argument('-j', '--workers', default=32, type=int, metavar='N',
                    help='number of data loading workers (default: 4)')
parser.add_argument('--epochs', default=120, type=int, metavar='N',
                    help='number of total epochs to run')
parser.add_argument('--start-epoch', default=0, type=int, metavar='N',
                    help='manual epoch number (useful on restarts)')
parser.add_argument('-b', '--batch-size', default=256, type=int,
                    metavar='N',
                    help='mini-batch size (default: 256), this is the total '
                    'batch size of all GPUs on the current node when '
                    'using Data Parallel or Distributed Data Parallel')
parser.add_argument('--lr', '--learning-rate', default=0.001, type=float,
                    metavar='LR', help='initial learning rate', dest='lr')
parser.add_argument('--momentum', default=0.9, type=float, metavar='M',
                    help='momentum')
parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float,
                    metavar='W', help='weight decay (default: 1e-4)',
                    dest='weight_decay')
parser.add_argument('-p', '--print-freq', default=10, type=int,
                    metavar='N', help='print frequency (default: 10)')
parser.add_argument('--resume', default='', type=str, metavar='PATH',
                    help='path to latest checkpoint (default: none)')
parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true',
                    help='evaluate model on validation set')
parser.add_argument('--pretrained', dest='pretrained', action='store_true',
                    help='use pre-trained model')
parser.add_argument('--world-size', default=-1, type=int,
                    help='number of nodes for distributed training')
parser.add_argument('--rank', default=-1, type=int,
                    help='node rank for distributed training')
parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str,
                    help='url used to set up distributed training')
parser.add_argument('--dist-backend', default='nccl', type=str,
                    help='distributed backend')
parser.add_argument('--seed', default=None, type=int,
                    help='seed for initializing training. ')
parser.add_argument('--gpu', default=None, type=int,
                    help='GPU id to use.')
parser.add_argument('--multiprocessing-distributed', action='store_true',
                    help='Use multi-processing distributed training to launch '
                    'N processes per node, which has N GPUs. This is the '
                    'fastest way to use PyTorch for either single node or '
                    'multi node data parallel training')

# Best top-1 accuracy seen so far; updated across epochs in main_worker.
best_acc1 = 0
def main():
    # Top-level entry point: parse CLI args, set up (optional) seeding and
    # distributed bookkeeping, then either spawn one worker per GPU or run a
    # single main_worker in-process.
    args = parser.parse_args()
    if args.seed is not None:
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        cudnn.deterministic = True
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')
    if args.gpu is not None:
        warnings.warn('You have chosen a specific GPU. This will completely '
                      'disable data parallelism.')
    if args.dist_url == "env://" and args.world_size == -1:
        args.world_size = int(os.environ["WORLD_SIZE"])
    args.distributed = args.world_size > 1 or args.multiprocessing_distributed
    ngpus_per_node = torch.cuda.device_count()
    if args.multiprocessing_distributed:
        # Since we have ngpus_per_node processes per node, the total world_size
        # needs to be adjusted accordingly
        args.world_size = ngpus_per_node * args.world_size
        # Use torch.multiprocessing.spawn to launch distributed processes: the
        # main_worker process function
        mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args))
    else:
        # Simply call main_worker function
        main_worker(args.gpu, ngpus_per_node, args)
def main_worker(gpu, ngpus_per_node, args):
    # Per-process worker: builds the model (with nnieqat quantization hooks),
    # wires up (distributed) data loading, optionally resumes from a
    # checkpoint, then runs the train/validate loop, checkpointing the
    # quantized weights each epoch.
    global best_acc1
    args.gpu = gpu
    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))
    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size, rank=args.rank)
    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = models.__dict__[args.arch]()
    # Attach fake-quantization hooks before any DDP/DataParallel wrapping,
    # so the hooks land on the underlying modules.
    register_quantization_hook(model)
    if not torch.cuda.is_available():
        print('using CPU, this will be slow')
    elif args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()
    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)
    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            if args.gpu is not None:
                # best_acc1 may be from a checkpoint from a different GPU
                best_acc1 = best_acc1.to(args.gpu)
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    cudnn.benchmark = True
    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))
    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler)
    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True)
    if args.evaluate:
        validate(val_loader, model, criterion, args)
        return
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, args)
        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args)
        # evaluate on validation set
        acc1 = validate(val_loader, model, criterion, args)
        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)
        if not args.multiprocessing_distributed or (args.multiprocessing_distributed
                                                    and args.rank % ngpus_per_node == 0):
            # dump weight quantized model.
            model.apply(quant_dequant_weight)
            save_checkpoint({
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_acc1': best_acc1,
                'optimizer': optimizer.state_dict(),
            }, is_best)
            # Restore fp32 weights so the next epoch trains unperturbed.
            model.apply(unquant_weight)
def train(train_loader, model, criterion, optimizer, epoch, args):
    # One training epoch.  Weights are fake-quantized by the registered
    # forward hooks; fp32 copies are restored before each optimizer step.
    batch_time = AverageMeter('Time', ':6.3f')
    data_time = AverageMeter('Data', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')
    progress = ProgressMeter(
        len(train_loader),
        [batch_time, data_time, losses, top1, top5],
        prefix="Epoch: [{}]".format(epoch))
    # switch to train mode
    model.train()
    # Fold + freeze BN so quantization sees inference-style convolutions.
    model = merge_freeze_bn(model)
    end = time.time()
    for i, (images, target) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)
        if args.gpu is not None:
            images = images.cuda(args.gpu, non_blocking=True)
        if torch.cuda.is_available():
            target = target.cuda(args.gpu, non_blocking=True)
        # compute output
        output = model(images)
        loss = criterion(output, target)
        # measure accuracy and record loss
        acc1, acc5 = accuracy(output, target, topk=(1, 5))
        losses.update(loss.item(), images.size(0))
        top1.update(acc1[0], images.size(0))
        top5.update(acc5[0], images.size(0))
        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        # Gradients were computed w.r.t. quantized weights; restore the fp32
        # copies so SGD updates the true weights (core QAT trick).
        model.apply(unquant_weight)
        optimizer.step()
        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()
        if i % args.print_freq == 0:
            progress.display(i)
def validate(val_loader, model, criterion, args):
    # Evaluate on the validation set; returns the average top-1 accuracy.
    batch_time = AverageMeter('Time', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')
    progress = ProgressMeter(
        len(val_loader),
        [batch_time, losses, top1, top5],
        prefix='Test: ')
    # switch to evaluate mode
    model.eval()
    with torch.no_grad():
        end = time.time()
        for i, (images, target) in enumerate(val_loader):
            if args.gpu is not None:
                images = images.cuda(args.gpu, non_blocking=True)
            if torch.cuda.is_available():
                target = target.cuda(args.gpu, non_blocking=True)
            # compute output
            output = model(images)
            loss = criterion(output, target)
            # measure accuracy and record loss
            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            losses.update(loss.item(), images.size(0))
            top1.update(acc1[0], images.size(0))
            top5.update(acc5[0], images.size(0))
            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()
            if i % args.print_freq == 0:
                progress.display(i)
        # TODO: this should also be done with the ProgressMeter
        print(' * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}'
              .format(top1=top1, top5=top5))
    return top1.avg
def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
    """Persist the training state; additionally copy it to
    'model_best.pth.tar' when this checkpoint is the best so far."""
    torch.save(state, filename)
    if not is_best:
        return
    shutil.copyfile(filename, 'model_best.pth.tar')
class AverageMeter(object):
    """Computes and stores the average and current value"""

    def __init__(self, name, fmt=':f'):
        self.name = name
        self.fmt = fmt
        self.reset()

    def reset(self):
        """Zero out all statistics."""
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        """Record ``val`` observed ``n`` times; refresh the running average."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count

    def __str__(self):
        template = ''.join(
            ['{name} {val', self.fmt, '} ({avg', self.fmt, '})'])
        return template.format(**self.__dict__)
class ProgressMeter(object):
    """Formats and prints a one-line progress summary for a set of meters."""

    def __init__(self, num_batches, meters, prefix=""):
        self.batch_fmtstr = self._get_batch_fmtstr(num_batches)
        self.meters = meters
        self.prefix = prefix

    def display(self, batch):
        """Print 'prefix [batch/total]' followed by every attached meter."""
        parts = [self.prefix + self.batch_fmtstr.format(batch)]
        parts.extend(str(meter) for meter in self.meters)
        print('\t'.join(parts))

    def _get_batch_fmtstr(self, num_batches):
        """Build a '[  i/total]' format string padded to total's width."""
        width = len(str(num_batches // 1))
        fmt = '{:' + str(width) + 'd}'
        return '[' + fmt + '/' + fmt.format(num_batches) + ']'
def adjust_learning_rate(optimizer, epoch, args):
    """Decay the learning rate by 2.5% (factor 0.975) every 3 epochs.

    Doc fix: the previous docstring ("decayed by 10 every 30 epochs") was
    inherited from the stock PyTorch ImageNet example and did not match the
    actual schedule implemented below.

    Args:
        optimizer: optimizer whose param_groups get the new 'lr'.
        epoch (int): current epoch index.
        args: namespace providing the initial learning rate ``args.lr``.
    """
    lr = args.lr * (0.975 ** (epoch // 3))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
def accuracy(output, target, topk=(1,)):
    """Computes the accuracy over the k top predictions for the specified values of k

    Args:
        output (Tensor): logits of shape (batch, num_classes).
        target (Tensor): ground-truth class indices of shape (batch,).
        topk (tuple): the k values to report.

    Returns:
        list[Tensor]: one single-element tensor (percentage in [0, 100])
        per requested k.
    """
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)
        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))
        res = []
        for k in topk:
            # Bug fix: use reshape(-1) instead of view(-1).  `correct` can be
            # non-contiguous (pred was transposed), and view() raises
            # "view size is not compatible..." on PyTorch >= 1.7 for k > 1.
            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res
# Allow running this training script directly from the command line.
if __name__ == '__main__':
    main()
================================================
FILE: tests/test_merge_freeze_bn.py
================================================
# -*- coding:utf-8 -*-
import unittest
from ddt import ddt, data
import torch
from torch import nn
from nnieqat import merge_freeze_bn, freeze_bn
@ddt
class TestMergeFreezeBNImpl(unittest.TestCase):
    # Verifies that merge_freeze_bn folds BN into the preceding conv without
    # changing the module's numerical output.

    # NOTE(review): these two helpers deliberately omit `self` -- they are
    # invoked as plain functions while the class body executes (building
    # data1/data2 below), never on an instance.
    def conv_bn(inp,
                oup,
                stride,
                conv_layer=nn.Conv2d,
                norm_layer=nn.BatchNorm2d):
        # 3x3 stride-`stride` conv (no bias) followed by BatchNorm.
        return nn.Sequential(conv_layer(inp, oup, 3, stride, 1, bias=False),
                             norm_layer(oup))

    def conv_1x1_bn(inp, oup, conv_layer=nn.Conv2d, norm_layer=nn.BatchNorm2d):
        # 1x1 conv (no bias) followed by BatchNorm.
        return nn.Sequential(conv_layer(inp, oup, 1, 1, 0, bias=False),
                             norm_layer(oup))

    # Fixture modules consumed by the ddt-parameterized test below.
    data1 = conv_bn(3, 3, 2)
    data2 = conv_1x1_bn(3, 3)

    @data(data1, data2)
    def test(self, m):
        # Run the same random input through the module before and after
        # fusion; the printed diff should be (numerically) zero.
        input = torch.randn(1, 3, 10, 10)
        m.eval()
        output_0 = m(input)
        print("module parameter before merge_freeze_bn: ")
        print(list(m.named_parameters()))
        m = merge_freeze_bn(m)
        m.eval()
        output_1 = m(input)
        print("module parameter after merge_freeze_bn: ")
        print(list(m.named_parameters()))
        print("output result before merge_freeze_bn: ")
        print(output_0)
        print("output result after merge_freeze_bn: ")
        print(output_1)
        print("output result diff: ")
        print(output_0 - output_1)
if __name__ == "__main__":
    # Execute the single parameterized test directly.
    runner = unittest.TextTestRunner()
    runner.run(unittest.TestSuite([TestMergeFreezeBNImpl("test")]))
================================================
FILE: tests/test_quant_impl.py
================================================
# -*- coding:utf-8 -*-
import unittest
from ddt import ddt, data
import math
import ctypes
import datetime
from ctypes import *
import numpy as np
from numba import cuda
import numpy as np
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
@ddt
class TestQuantImpl(unittest.TestCase):
    """Compares the GFPQ GPU quantization library (via ctypes) against the
    local fake_quantize implementation on a range of input vectors, printing
    the maximum absolute and relative differences."""

    # Fixture vectors: zeros, dense positive/negative ramps, and hand-picked
    # values around the quantizer's dynamic range; max_thres caps the range.
    max_thres = 512
    data0 = np.array([0])
    data1 = np.array([v / 25600 + 1.04
                      for v in range(25600)] + [100, max_thres])
    # NOTE(review): data2 is assigned twice — the positive ramp below is
    # immediately overwritten by the negated ramp. Looks like a copy-paste
    # slip; kept as-is to preserve behavior. Confirm intent.
    data2 = np.array([v / 25600 + 1.04
                      for v in range(25600)] + [100, max_thres])
    data2 = np.array([-v / 25600 - 1.04
                      for v in range(25600)] + [-100, -max_thres])
    data3 = np.array(
        [0, 1, 2, 2.03992188, 2.03996094, 3, 4, 5, 10, 100, max_thres])
    # Same patterns again, just past the previous threshold value.
    max_thres = 513
    data4 = np.array([v / 25600 + 1.04
                      for v in range(25600)] + [100, max_thres])
    # NOTE(review): data5 is identical to data4 — presumably one of them was
    # meant to differ (compare the data1/data2 pair above). Verify.
    data5 = np.array([v / 25600 + 1.04
                      for v in range(25600)] + [100, max_thres])
    data6 = np.array([-v / 25600 - 1.04
                      for v in range(25600)] + [-100, -max_thres])
    data7 = np.array(
        [0, 1, 2, 2.03992188, 2.03996094, 3, 4, 5, 10, 100, max_thres])
    data8 = np.array([
        0, -1, -2, -2.03992188, -2.03996094, -3, -4, -5, -10, -100, -max_thres
    ])
    data9 = np.array(range(1234))
    data10 = np.array([-v for v in range(1234)])

    @data(data0, data1, data2, data3, data4, data5, data6, data7, data8, data9,
          data10)
    def test(self, data):
        """Quantize `data` with the GFPQ GPU library, then with the Torch
        fake_quantize implementation, and print the resulting differences."""
        os.environ['CUDA_VISIBLE_DEVICES'] = '0'
        # load library
        dl = ctypes.cdll.LoadLibrary
        # NOTE(review): this path says nnieqat/gpu/lib/, but the repository
        # ships the shared object under nnieqat/cuda10/lib/ — confirm path.
        quant_lib = dl("nnieqat/gpu/lib/libgfpq_gpu.so")
        _libcublas = ctypes.cdll.LoadLibrary("libcublas.so")

        # struct GFPQ_PARAM_ST in gfpq.hpp
        class GFPQ_PARAM_ST(ctypes.Structure):
            _fields_ = [("mode", ctypes.c_int), ("buf", ctypes.c_byte * 16)]

        class _types:
            """Some alias types."""
            handle = ctypes.c_void_p
            stream = ctypes.c_void_p

        # Keep the original values for the Torch path; the GFPQ call below
        # overwrites `data` in place via the device buffer copy-back.
        data_origin = data.copy()
        print(
            "----------------------------------------------------------------------"
        )
        print("\n\nOriginal data:")
        print(data)
        # The GFPQ kernel operates on float32 buffers.
        data = data.astype(np.float32)
        stream = cuda.stream()
        _libcublas.cublasCreate_v2.restype = int
        _libcublas.cublasCreate_v2.argtypes = [ctypes.c_void_p]
        cublas_handle = _types.handle()
        _libcublas.cublasCreate_v2(ctypes.byref(cublas_handle))
        data_gpu = cuda.to_device(data, stream=stream)
        data_p = data_gpu.device_ctypes_pointer
        bit_width = 8
        param = GFPQ_PARAM_ST()
        # init or update param first
        param.mode = 0
        ret = quant_lib.HI_GFPQ_QuantAndDeQuant_GPU_PY(data_p, data.size,
                                                       bit_width,
                                                       ctypes.byref(param),
                                                       stream.handle,
                                                       cublas_handle)
        if ret != 0:
            print("HI_GFPQ_QuantAndDeQuant failed(%d)\n" % (ret)),
        # use apply param
        param.mode = 2
        ret = quant_lib.HI_GFPQ_QuantAndDeQuant_GPU_PY(data_p, data.size,
                                                       bit_width,
                                                       ctypes.byref(param),
                                                       stream.handle,
                                                       cublas_handle)
        if ret != 0:
            print("HI_GFPQ_QuantAndDeQuant failed(%d)" % (ret)),
        data_gpu.copy_to_host(data, stream=stream)
        # data may not be available
        stream.synchronize()
        _libcublas.cublasDestroy_v2(cublas_handle)

        # Quantize the same original values with the local implementation.
        import nnieqat
        from quant_impl import fake_quantize
        import torch
        tensor = torch.Tensor(data_origin).cuda()
        tensor.data = fake_quantize(tensor.data.detach(), 8)
        diff = abs(tensor.cpu().numpy() - data)
        # diff_thres = np.max(abs(data)) * 0.001
        # print("\nDIFF > 0.1%: ")
        # print("idx: ", np.where(diff > diff_thres))
        # print("Original data:", data_origin[np.where(diff > diff_thres)])
        # print("GFPQ result:", data[np.where(diff > diff_thres)])
        # print("Impl result:", tensor.cpu().numpy()[np.where(diff > diff_thres)])
        diff_max = np.max(diff)
        print("\nDIFF MAX: " + str(diff_max))
        # Relative diff; the 1e-18 floor guards against dividing by zero
        # when the quantized reference is all zeros (e.g. data0).
        print("\nDIFF RATIO: " +
              str(diff_max / max(np.max(abs(data)), pow(10, -18))))
if __name__ == "__main__":
    # Drive the single data-driven case directly, bypassing test discovery.
    single = unittest.TestSuite()
    single.addTest(TestQuantImpl("test"))
    unittest.TextTestRunner().run(single)