Repository: aovoc/nnieqat-pytorch
Branch: master
Commit: 91410cf331a1
Files: 36
Total size: 75.0 KB
Directory structure:
gitextract_yis4nxki/
├── LICENSE.txt
├── MANIFEST.in
├── Makefile
├── README.md
├── build_helper.py
├── docker/
│ └── Dockerfile
├── docs/
│ ├── Makefile
│ ├── make.bat
│ └── source/
│ ├── build_helper.rst
│ ├── conf.py
│ ├── index.rst
│ ├── modules.rst
│ ├── nnieqat.cuda10.rst
│ ├── nnieqat.modules.rst
│ ├── nnieqat.rst
│ └── setup.rst
├── nnieqat/
│ ├── __init__.py
│ ├── cuda10/
│ │ ├── LICENSE.txt
│ │ └── lib/
│ │ ├── gfpq.lib
│ │ ├── libgfpq.a
│ │ ├── libgfpq.so.1.1.5
│ │ ├── libgfpq_gpu.a
│ │ └── libgfpq_gpu.so.1.1.5
│ └── quantize.py
├── pyproject.toml
├── setup.cfg
├── setup.py
├── src/
│ ├── fake_quantize.cpp
│ ├── fake_quantize.cu
│ ├── fake_quantize.h
│ └── test/
│ ├── Makefile
│ └── test.cu
└── tests/
├── test_cifar10.py
├── test_imagenet.py
├── test_merge_freeze_bn.py
└── test_quant_impl.py
================================================
FILE CONTENTS
================================================
================================================
FILE: LICENSE.txt
================================================
MIT License
Copyright (c) Minqin Chen
Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
================================================
FILE: MANIFEST.in
================================================
================================================
FILE: Makefile
================================================
# Uncomment for debugging
# DEBUG := 1
# Pretty build
# Q ?= @

CXX := g++
python := python3

# Query the active Python / PyTorch installation so the extension is built
# against the exact torch build (headers, libraries, CUDA home, C++11 ABI)
# that will load it.  $(python) is used consistently instead of whatever
# bare `python` happens to resolve to on PATH.
PYTHON_HEADER_DIR := $(shell $(python) -c 'from distutils.sysconfig import get_python_inc; print(get_python_inc())')
PYTORCH_INCLUDES := $(shell $(python) -c 'from torch.utils.cpp_extension import include_paths; [print(p) for p in include_paths()]')
PYTORCH_LIBRARIES := $(shell $(python) -c 'from torch.utils.cpp_extension import library_paths; [print(p) for p in library_paths()]')
CUDA_DIR := $(shell $(python) -c 'from torch.utils.cpp_extension import _find_cuda_home; print(_find_cuda_home())')
WITH_ABI := $(shell $(python) -c 'import torch; print(int(torch._C._GLIBCXX_USE_CXX11_ABI))')

INCLUDE_DIRS := ./ $(CUDA_DIR)/include
INCLUDE_DIRS += $(PYTHON_HEADER_DIR)
INCLUDE_DIRS += $(PYTORCH_INCLUDES)

# Custom (MKL/ATLAS/OpenBLAS) include and lib directories.
# BLAS_INCLUDE := /path/to/your/blas
# BLAS_LIB := /path/to/your/blas

SRC_DIR := ./src
OBJ_DIR := ./obj
CPP_SRCS := $(wildcard $(SRC_DIR)/*.cpp)
CU_SRCS := $(wildcard $(SRC_DIR)/*.cu)
OBJS := $(patsubst $(SRC_DIR)/%.cpp,$(OBJ_DIR)/%.o,$(CPP_SRCS))
CU_OBJS := $(patsubst $(SRC_DIR)/%.cu,$(OBJ_DIR)/cuda/%.o,$(CU_SRCS))
STATIC_LIB := $(OBJ_DIR)/libquant_impl.a

# Real GPU code for sm_50..sm_75 plus PTX for compute_75 so the library stays
# forward-compatible with newer GPUs via JIT.
CUDA_ARCH := -gencode arch=compute_50,code=sm_50 \
	-gencode arch=compute_52,code=sm_52 \
	-gencode arch=compute_60,code=sm_60 \
	-gencode arch=compute_61,code=sm_61 \
	-gencode arch=compute_70,code=sm_70 \
	-gencode arch=compute_75,code=sm_75 \
	-gencode arch=compute_75,code=compute_75

# NOTE(review): LIBRARIES is collected but never referenced by a link rule in
# this file; the final shared-object link is performed by setup.py.
LIBRARIES += stdc++ cudart c10 caffe2 torch torch_python caffe2_gpu

ifeq ($(DEBUG), 1)
COMMON_FLAGS += -DDEBUG -g -O0
NVCCFLAGS += -g -G # -rdc true
else
COMMON_FLAGS += -DNDEBUG -O3
endif

WARNINGS := -Wall -Wno-sign-compare -Wcomment

INCLUDE_DIRS += $(BLAS_INCLUDE)
CXXFLAGS += -MMD -MP
COMMON_FLAGS += $(foreach includedir,$(INCLUDE_DIRS),-I$(includedir)) \
	-DTORCH_API_INCLUDE_EXTENSION_H -D_GLIBCXX_USE_CXX11_ABI=$(WITH_ABI)
CXXFLAGS += -pthread -fPIC -fwrapv -std=c++14 $(COMMON_FLAGS) $(WARNINGS)
NVCCFLAGS += -std=c++14 -ccbin=$(CXX) -Xcompiler -fPIC -use_fast_math $(COMMON_FLAGS)

default: $(STATIC_LIB)

$(OBJ_DIR):
	@ mkdir -p $@
	@ mkdir -p $@/cuda

$(OBJ_DIR)/%.o: $(SRC_DIR)/%.cpp | $(OBJ_DIR)
	@ echo CXX $<
	$(Q)$(CXX) $< $(CXXFLAGS) -c -o $@

# Two nvcc passes: first emits the .d dependency file next to the object,
# second compiles the object itself.
$(OBJ_DIR)/cuda/%.o: $(SRC_DIR)/%.cu | $(OBJ_DIR)
	@ echo NVCC $<
	$(Q)nvcc $(NVCCFLAGS) $(CUDA_ARCH) -M $< -o ${@:.o=.d} \
		-odir $(@D)
	$(Q)nvcc $(NVCCFLAGS) $(CUDA_ARCH) -c $< -o $@

$(STATIC_LIB): $(OBJS) $(CU_OBJS) | $(OBJ_DIR)
	$(RM) $(STATIC_LIB)
	$(RM) -r build dist
	@ echo LD -o $@
	ar rc $(STATIC_LIB) $(OBJS) $(CU_OBJS)

build:
	$(python) setup.py build

upload:
	$(python) setup.py sdist bdist_wheel
	#twine upload dist/*

clean:
	$(RM) -r build dist nnieqat.egg-info

test:
	nosetests -s tests/test_quant_impl.py --nologcapture
	nosetests -s tests/test_merge_freeze_bn.py --nologcapture

lint:
	pylint nnieqat --reports=n

lintfull:
	pylint nnieqat

install:
	$(python) setup.py install

uninstall:
	$(python) setup.py install --record install.log
	xargs rm -rf < install.log
	$(RM) install.log

# These targets are commands, not files.  Without .PHONY, `make build` is
# shadowed by the `build/` directory that setup.py creates and would report
# "'build' is up to date"; a stray file named `clean` or `test` would do the
# same to those targets.
.PHONY: default build upload clean test lint lintfull install uninstall
================================================
FILE: README.md
================================================
# nnieqat-pytorch
Nnieqat is a quantize aware training package for Neural Network Inference Engine(NNIE) on pytorch, it uses hisilicon quantization library to quantize module's weight and activation as fake fp32 format.
## Table of Contents
- [nnieqat-pytorch](#nnieqat-pytorch)
- [Table of Contents](#table-of-contents)
- [Installation](#installation)
- [Usage](#usage)
- [Code Examples](#code-examples)
- [Results](#results)
- [Todo](#todo)
- [Reference](#reference)
## Installation
* Supported Platforms: Linux
* Accelerators and GPUs: NVIDIA GPUs via CUDA driver ***10.1*** or ***10.2***.
* Dependencies:
* python >= 3.5, < 4
* llvmlite >= 0.31.0
* pytorch >= 1.5
* numba >= 0.42.0
* numpy >= 1.18.1
* Install nnieqat via pypi:
```shell
$ pip install nnieqat
```
* Install nnieqat in docker(easy way to solve environment problems):
```shell
$ cd docker
$ docker build -t nnieqat-image .
```
* Install nnieqat via repo:
```shell
$ git clone https://github.com/aovoc/nnieqat-pytorch
$ cd nnieqat-pytorch
$ make install
```
## Usage
* add quantization hook.
quantize and dequantize weight and data with HiSVP GFPQ library in forward() process.
```python
from nnieqat import quant_dequant_weight, unquant_weight, merge_freeze_bn, register_quantization_hook
...
...
register_quantization_hook(model)
...
```
* merge bn weight into conv and freeze bn
suggest finetuning from a well-trained model, merge_freeze_bn at beginning. do it after a few epochs of training otherwise.
```python
from nnieqat import quant_dequant_weight, unquant_weight, merge_freeze_bn, register_quantization_hook
...
...
model.train()
model = merge_freeze_bn(model) #it will change bn to eval() mode during training
...
```
* Unquantize weight before update it
```python
from nnieqat import quant_dequant_weight, unquant_weight, merge_freeze_bn, register_quantization_hook
...
...
model.apply(unquant_weight) # using original weight while updating
optimizer.step()
...
```
* Dump weight optimized model
```python
from nnieqat import quant_dequant_weight, unquant_weight, merge_freeze_bn, register_quantization_hook
...
...
model.apply(quant_dequant_weight)
save_checkpoint(...)
model.apply(unquant_weight)
...
```
* Using EMA with caution(Not recommended).
## Code Examples
* [Cifar10 quantization aware training example][cifar10_qat] (add nnieqat into [pytorch_cifar10_tutorial][cifar10_example])
```python tests/test_cifar10.py```
* [ImageNet quantization finetuning example][imagenet_qat] (add nnieqat into [pytorch_imagenet_main.py][imagenet_example])
```python tests/test_imagenet.py --pretrained path_to_imagenet_dataset```
## Results
* ImageNet
```
python test/test_imagenet.py /data/imgnet/ --arch squeezenet1_1 --lr 0.001 --pretrained --epoch 10 # nnie_lr_e-3_ft
python pytorh_imagenet_main.py /data/imgnet/ --arch squeezenet1_1 --lr 0.0001 --pretrained --epoch 10 # lr_e-4_ft
python test/test_imagenet.py /data/imgnet/ --arch squeezenet1_1 --lr 0.0001 --pretrained --epoch 10 # nnie_lr_e-4_ft
```
finetune result:
| | trt_fp32 | trt_int8 | nnie |
| -------- | -------- | -------- | -------- |
| torchvision | 0.56992 | 0.56424 | 0.56026 |
| nnie_lr_e-3_ft | 0.56600 | 0.56328 | 0.56612 |
| lr_e-4_ft | 0.57884 | 0.57502 | 0.57542 |
| nnie_lr_e-4_ft | 0.57834 | 0.57524 | 0.57730 |
* coco
net: simplified yolov5s
train 300 epochs, hi3559 test result:
Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.338
Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.540
Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.357
Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.187
Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.377
Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.445
Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.284
Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.484
Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.542
Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.357
Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.595
Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.679
finetune 20 epochs, hi3559 test result:
Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.339
Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.539
Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.360
Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.191
Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.378
Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.446
Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.285
Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.485
Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.544
Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.361
Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.596
Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.683
## Todo
* Generate quantized model directly.
## Reference
HiSVP 量化库使用指南
[Quantizing deep convolutional networks for efficient inference: A whitepaper][quant_whitepaper]
[8-bit Inference with TensorRT][trt_quant]
[Distilling the Knowledge in a Neural Network][distillingNN]
[cifar10_qat]: https://github.com/aovoc/nnieqat-pytorch/blob/master/tests/test_cifar10.py
[imagenet_qat]: https://github.com/aovoc/nnieqat-pytorch/blob/master/tests/test_imagenet.py
[imagenet_example]: https://github.com/pytorch/examples/blob/master/imagenet/main.py
[cifar10_example]: https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html
[quant_whitepaper]: https://arxiv.org/abs/1806.08342
[trt_quant]: https://on-demand.gputechconf.com/gtc/2017/presentation/s7310-8-bit-inference-with-tensorrt.pdf
[distillingNN]: https://arxiv.org/abs/1503.02531
================================================
FILE: build_helper.py
================================================
import os
import shutil
import subprocess
import sys
import tempfile
from distutils import ccompiler
def print_warning(*lines):
    """Print each of *lines* as a '*** WARNING:' message framed by banners."""
    banner = '**************************************************'
    print(banner)
    for message in lines:
        print('*** WARNING: %s' % message)
    print(banner)
def get_path(key):
    """Split environment variable *key* on the platform path separator.

    Returns [''] when the variable is unset (mirrors ''.split(os.pathsep)).
    """
    value = os.environ.get(key, '')
    return value.split(os.pathsep)
def search_on_path(filenames):
    """Return the absolute path of the first of *filenames* found on PATH.

    Scans PATH entries in order; returns None when nothing matches.
    """
    for directory in get_path('PATH'):
        for candidate in filenames:
            candidate_path = os.path.join(directory, candidate)
            if os.path.exists(candidate_path):
                return os.path.abspath(candidate_path)
    return None
# Supported toolkit range, expressed as CUDA_VERSION integers
# (major * 1000 + minor * 10): >= 10.1 and (warn-only) <= 10.2.
minimum_cuda_version = 10010
maxinum_cuda_version = 10030  # sic: misspelling of "maximum" kept, name is referenced below
minimum_cudnn_version = 7000
def get_compiler_setting():
    """Build include/library directory settings for compiling CUDA stubs.

    The CUDA root is located from, in order: the nvcc found on PATH, the
    CUDA_PATH environment variable (Nvidia's Windows convention), and
    finally /usr/local/cuda.  Returns a dict with 'include_dirs',
    'library_dirs', 'define_macros' and 'language' keys.
    """
    nvcc = search_on_path(('nvcc', 'nvcc.exe'))
    default_cuda_root = None
    if nvcc is None:
        print_warning('nvcc not in path.', 'Please set path to nvcc.')
    else:
        default_cuda_root = os.path.normpath(
            os.path.join(os.path.dirname(nvcc), '..'))

    cuda_root = os.environ.get('CUDA_PATH', '')  # Nvidia default on Windows
    if len(cuda_root) > 0 and cuda_root != default_cuda_root:
        print_warning('nvcc path != CUDA_PATH',
                      'nvcc path: %s' % default_cuda_root,
                      'CUDA_PATH: %s' % cuda_root)
    if not os.path.exists(cuda_root):
        cuda_root = default_cuda_root
    if not cuda_root and os.path.exists('/usr/local/cuda'):
        cuda_root = '/usr/local/cuda'

    include_dirs = []
    library_dirs = []
    define_macros = []
    if cuda_root:
        include_dirs.append(os.path.join(cuda_root, 'include'))
        if sys.platform == 'win32':
            library_dirs.append(os.path.join(cuda_root, 'bin'))
            library_dirs.append(os.path.join(cuda_root, 'lib', 'x64'))
        else:
            library_dirs.append(os.path.join(cuda_root, 'lib64'))
            library_dirs.append(os.path.join(cuda_root, 'lib'))
    if sys.platform == 'darwin':
        library_dirs.append('/usr/local/cuda/lib')
    return {
        'include_dirs': include_dirs,
        'library_dirs': library_dirs,
        'define_macros': define_macros,
        'language': 'c++',
    }
def check_cuda_version():
    """Compile and run a stub that prints CUDA_VERSION; validate the range.

    Returns:
        bool: True when the detected CUDA version is supported (>= 10.1).
        A version newer than 10.2 only triggers a warning and still returns
        True, matching the original behaviour.
    """
    compiler = ccompiler.new_compiler()
    settings = get_compiler_setting()
    try:
        # Bug fix: the #include directives had lost their header names,
        # which made the stub uncompilable.
        out = build_and_run(compiler,
                            '''
                            #include <cuda.h>
                            #include <stdio.h>
                            int main(int argc, char* argv[]) {
                              printf("%d", CUDA_VERSION);
                              return 0;
                            }
                            ''',
                            include_dirs=settings['include_dirs'])
    except Exception as e:
        print_warning('Cannot check CUDA version', str(e))
        return False
    cuda_version = int(out)
    if cuda_version < minimum_cuda_version:
        print_warning('CUDA version is too old: %d' % cuda_version,
                      'CUDA v10.1 or CUDA v10.2 is required')
        return False
    if cuda_version > maxinum_cuda_version:
        # Warn but do not fail on too-new toolkits.
        print_warning('CUDA version is too new: %d' % cuda_version,
                      'CUDA v10.1 or CUDA v10.2 is required')
    return True
def check_cudnn_version():
    """Compile and run a stub that prints CUDNN_VERSION; validate the range.

    Returns:
        bool: True when the detected cuDNN version is >= 7.0.
    """
    compiler = ccompiler.new_compiler()
    settings = get_compiler_setting()
    try:
        # Bug fix: the #include directives had lost their header names,
        # which made the stub uncompilable.
        out = build_and_run(compiler,
                            '''
                            #include <cudnn.h>
                            #include <stdio.h>
                            int main(int argc, char* argv[]) {
                              printf("%d", CUDNN_VERSION);
                              return 0;
                            }
                            ''',
                            include_dirs=settings['include_dirs'])
    except Exception as e:
        print_warning('Cannot check cuDNN version\n{0}'.format(e))
        return False
    cudnn_version = int(out)
    if cudnn_version < minimum_cudnn_version:
        print_warning('cuDNN version is too old: %d' % cudnn_version,
                      'cuDNN v7 or newer is required')
        return False
    return True
def build_and_run(compiler,
                  source,
                  libraries=(),
                  include_dirs=(),
                  library_dirs=()):
    """Compile *source* as C++, link an executable, run it, return its stdout.

    Args:
        compiler: a distutils ccompiler instance.
        source (str): C/C++ translation unit text.
        libraries / include_dirs / library_dirs: forwarded to the compiler.

    Returns:
        bytes: the program's standard output.

    Raises:
        Exception: with a descriptive message when linking or running fails.
        The build happens in a throw-away temp directory that is always
        removed.
    """
    temp_dir = tempfile.mkdtemp()
    try:
        source_path = os.path.join(temp_dir, 'a.cpp')
        with open(source_path, 'w') as stream:
            stream.write(source)
        objects = compiler.compile([source_path],
                                   output_dir=temp_dir,
                                   include_dirs=include_dirs)
        exe_path = os.path.join(temp_dir, 'a')
        link_args = ['/MANIFEST'] if sys.platform == 'win32' else []
        try:
            compiler.link_executable(objects,
                                     exe_path,
                                     libraries=libraries,
                                     library_dirs=library_dirs,
                                     extra_postargs=link_args,
                                     target_lang='c++')
        except Exception as e:
            raise Exception(
                'Cannot build a stub file.\nOriginal error: {0}'.format(e))
        try:
            return subprocess.check_output(exe_path)
        except Exception as e:
            raise Exception(
                'Cannot execute a stub file.\nOriginal error: {0}'.format(e))
    finally:
        shutil.rmtree(temp_dir, ignore_errors=True)
================================================
FILE: docker/Dockerfile
================================================
ARG PYTORCH="1.6.0"
ARG CUDA="10.1"
ARG CUDNN="7"
# Base image: official PyTorch devel image pinned to a CUDA version
# (10.1/10.2) matching what the package supports.
FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel
# Compile CUDA extensions for Pascal/Volta plus PTX for forward compatibility.
ENV TORCH_CUDA_ARCH_LIST="6.0 6.1 7.0+PTX"
ENV TORCH_NVCC_FLAGS="-Xfatbin -compress-all"
# NOTE(review): ENV performs no command substitution — the literal text
# "$(dirname $(which conda))/../" is stored, and only expands if a shell
# evaluates it later. Confirm this is intended.
ENV CMAKE_PREFIX_PATH="$(dirname $(which conda))/../"
# System packages needed by common vision stacks; clean apt caches to keep
# the image small.
RUN apt-get update && apt-get install -y git ninja-build libglib2.0-0 libsm6 libxrender-dev libxext6 \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*
# Install nnieqat
RUN pip install nnieqat
WORKDIR /root/
================================================
FILE: docs/Makefile
================================================
# Minimal makefile for Sphinx documentation
#
# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS ?=
SPHINXBUILD ?= sphinx-build
# Source .rst files live in source/, HTML (etc.) is written to build/.
SOURCEDIR = source
BUILDDIR = build

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
================================================
FILE: docs/make.bat
================================================
@ECHO OFF
REM Sphinx documentation build wrapper for Windows (mirrors docs/Makefile).
pushd %~dp0
REM Command file for Sphinx documentation
if "%SPHINXBUILD%" == "" (
	set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=source
set BUILDDIR=build
if "%1" == "" goto help
REM Probe that sphinx-build is runnable; errorlevel 9009 means the command
REM was not found on PATH.
%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
	echo.
	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
	echo.installed, then set the SPHINXBUILD environment variable to point
	echo.to the full path of the 'sphinx-build' executable. Alternatively you
	echo.may add the Sphinx directory to PATH.
	echo.
	echo.If you don't have Sphinx installed, grab it from
	echo.http://sphinx-doc.org/
	exit /b 1
)
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end
:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
:end
popd
================================================
FILE: docs/source/build_helper.rst
================================================
build\_helper module
====================
.. automodule:: build_helper
:members:
:undoc-members:
:show-inheritance:
================================================
FILE: docs/source/conf.py
================================================
# -*- coding: utf-8 -*-
#
import os
import sys
sys.path.insert(0, os.path.abspath('./../../'))
# -- Project information -----------------------------------------------------
project = 'nnieqat'
copyright = '2020, Minqin Chen'
author = 'Minqin Chen'
# The short X.Y version
version = ''
# The full version, including alpha/beta/rc tags
release = '0.1.0'
# -- General configuration ---------------------------------------------------
# If your documentation needs a minimal Sphinx version, state it here.
#
# needs_sphinx = '1.0'
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
'sphinx.ext.todo',
'sphinx.ext.githubpages',
'sphinx.ext.autodoc',
]
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
# The suffix(es) of source filenames.
# You can specify multiple suffix as a list of string:
#
# source_suffix = ['.rst', '.md']
source_suffix = '.rst'
# The master toctree document.
master_doc = 'index'
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
language = None
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path .
exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'
# -- Options for HTML output -------------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
#
# html_theme_options = {}
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
# Custom sidebar templates, must be a dictionary that maps document names
# to template names.
#
# The default sidebars (for documents that don't match any pattern) are
# defined by theme itself. Builtin themes are using these templates by
# default: ``['localtoc.html', 'relations.html', 'sourcelink.html',
# 'searchbox.html']``.
#
# html_sidebars = {}
html_theme = 'sphinx_rtd_theme'
================================================
FILE: docs/source/index.rst
================================================
.. nnieqat documentation master file, created by
sphinx-quickstart on Fri Aug 21 03:52:34 2020.
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.
Welcome to nnieqat's documentation!
===================================
.. toctree::
:maxdepth: 2
:caption: Contents:
Indices and tables
==================
* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`
================================================
FILE: docs/source/modules.rst
================================================
nnieqat
=======
.. toctree::
:maxdepth: 4
nnieqat
================================================
FILE: docs/source/nnieqat.cuda10.rst
================================================
nnieqat.cuda10 package
======================
Submodules
----------
nnieqat.cuda10.quantize module
------------------------------
.. automodule:: nnieqat.cuda10.quantize
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: nnieqat.cuda10
:members:
:undoc-members:
:show-inheritance:
================================================
FILE: docs/source/nnieqat.modules.rst
================================================
nnieqat.modules package
=======================
Submodules
----------
nnieqat.modules.conv module
---------------------------
.. automodule:: nnieqat.modules.conv
:members:
:undoc-members:
:show-inheritance:
nnieqat.modules.linear module
-----------------------------
.. automodule:: nnieqat.modules.linear
:members:
:undoc-members:
:show-inheritance:
nnieqat.modules.pooling module
------------------------------
.. automodule:: nnieqat.modules.pooling
:members:
:undoc-members:
:show-inheritance:
Module contents
---------------
.. automodule:: nnieqat.modules
:members:
:undoc-members:
:show-inheritance:
================================================
FILE: docs/source/nnieqat.rst
================================================
nnieqat package
===============
Subpackages
-----------
.. toctree::
nnieqat.cuda10
nnieqat.gpu
nnieqat.modules
Module contents
---------------
.. automodule:: nnieqat
:members:
:undoc-members:
:show-inheritance:
================================================
FILE: docs/source/setup.rst
================================================
setup module
============
.. automodule:: setup
:members:
:undoc-members:
:show-inheritance:
================================================
FILE: nnieqat/__init__.py
================================================
""" quantize aware training package for Neural Network Inference Engine(NNIE) on pytorch.
"""
import sys

# Re-export the public quantization helpers from the implementation module.
# (The previous ``try: … except: raise`` wrapper was a no-op and has been
# removed; an ImportError propagates unchanged either way.)
from .quantize import quant_dequant_weight, unquant_weight, freeze_bn, \
    merge_freeze_bn, register_quantization_hook, test

__all__ = [
    "quant_dequant_weight", "unquant_weight", "freeze_bn", "merge_freeze_bn", \
    "register_quantization_hook", "test"]

# Self-check run at import time: exercises the quantization backend.
# NOTE(review): running test() on import is surprising for library users —
# consider making it opt-in.
test()
================================================
FILE: nnieqat/cuda10/LICENSE.txt
================================================
/*
* Copyright (c) 2018, Hisilicon Limited
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
================================================
FILE: nnieqat/quantize.py
================================================
#!/usr/bin/env python
"""Quantize function.
"""
import ctypes
import datetime
import logging
from os.path import abspath, dirname
import torch
import numpy as np
from numba import cuda
# NOTE(review): absolute import — presumably provided by the compiled
# extension installed alongside this package; verify at packaging time.
from quant_impl import fake_quantize
# Use the bundled HiSVP GFPQ CUDA library only on single-GPU machines; with
# multiple GPUs the pure-python fallback (fake_quantize) is used instead.
_USE_GFPQ_QUANT_LIB = (torch.cuda.device_count() <= 1)
class GFPQParamSt(ctypes.Structure):
    r"""GFPQ param, corresponds with struct GFPQ_PARAM_ST in gfpq.hpp"""
    # mode: GFPQ_MODE_* selector; param: 16 opaque bytes of library state.
    _fields_ = [("mode", ctypes.c_int), ("param", ctypes.c_byte * 16)]
class _types:
    r"""Some alias types."""
    # Opaque device/library pointers passed to the GFPQ / cuBLAS C APIs.
    handle = ctypes.c_void_p
    stream = ctypes.c_void_p
class QuantAndDeQuantGPU():
    r"""Quantize and dequantize data with the HiSVP GFPQ library, in place.

    When the native library path fails (load error, call error or non-zero
    return code) the instance falls back to the pure-python ``fake_quantize``
    implementation and disables the native path process-wide via the
    module-global ``_USE_GFPQ_QUANT_LIB`` flag.
    """

    def __init__(self,
                 libquant_path=dirname(abspath(__file__)) +
                 "/gpu/lib/libgfpq_gpu.so",
                 libcublas_path="libcublas.so",
                 bit_width=8,
                 param_mode=0):
        # NOTE(review): the default path points at "gpu/lib" while the
        # repository ships the library under "cuda10/lib" — confirm the
        # intended install layout.
        global _USE_GFPQ_QUANT_LIB
        self._bit_width = bit_width
        if _USE_GFPQ_QUANT_LIB:
            self._libquant = ctypes.cdll.LoadLibrary(libquant_path)
            self._libcublas = ctypes.cdll.LoadLibrary(libcublas_path)
            self._libcublas.cublasCreate_v2.restype = int
            self._libcublas.cublasCreate_v2.argtypes = [ctypes.c_void_p]
            self._cublas_handle = _types.handle()
            self._libcublas.cublasCreate_v2(ctypes.byref(self._cublas_handle))
        self._param = GFPQParamSt()
        self._stream = cuda.stream()
        self._param.mode = param_mode

    def __call__(self, tensor, mode=0):
        r""" Converts float weights to quantized weights.
        Args:
            - tensor: input data (torch tensor or tuple of tensors); mutated
              in place via its ``.data`` attribute.
            - mode: GFPQ mode for param
                GFPQ_MODE_INIT(0): There is no valid parameter in param[].
                    Generate the parameter and filled in param[].
                GFPQ_MODE_UPDATE(1): There is parameter in param[]. Generate
                    new parameter, update param[] when the new parameter is
                    better.
                GFPQ_MODE_APPLY_ONLY(2): There is parameter in param[]. Don't
                    generate parameter. Just use the param[].
        """
        global _USE_GFPQ_QUANT_LIB
        if _USE_GFPQ_QUANT_LIB:
            # Bug fix: ``ret`` was referenced in the finally-block without
            # being initialised; an exception raised before the library call
            # produced a NameError instead of reaching the fallback.  Start
            # non-zero so any failure routes into the python fallback below.
            ret = -1
            try:
                if isinstance(tensor, tuple):
                    for tensor_item in tensor:
                        data_cuda_array = cuda.as_cuda_array(
                            tensor_item.data.detach())
                        data_p = data_cuda_array.device_ctypes_pointer
                        self._param.mode = mode
                        ret = self._libquant.HI_GFPQ_QuantAndDeQuant_GPU_PY(
                            data_p, data_cuda_array.size, self._bit_width,
                            ctypes.byref(self._param), self._stream.handle,
                            self._cublas_handle)
                else:
                    data_cuda_array = cuda.as_cuda_array(tensor.data.detach())
                    data_p = data_cuda_array.device_ctypes_pointer
                    self._param.mode = mode
                    ret = self._libquant.HI_GFPQ_QuantAndDeQuant_GPU_PY(
                        data_p, data_cuda_array.size, self._bit_width,
                        ctypes.byref(self._param), self._stream.handle,
                        self._cublas_handle)
            except Exception:
                # Best-effort: any native failure falls through to the
                # python fallback selected by ``ret`` below.
                pass
            finally:
                if ret != 0:
                    # Disable the native path for the rest of the process and
                    # quantize with the python implementation instead.
                    _USE_GFPQ_QUANT_LIB = False
                    logger = logging.getLogger(__name__)
                    logger.setLevel(logging.WARNING)
                    logger.warning(
                        """Failed to quantize data with default HiSVP GFPQ library,
                        Use implemented quantization algorithm instead.""")
                    if isinstance(tensor, tuple):
                        for tensor_item in tensor:
                            tensor_item.data = fake_quantize(
                                tensor_item.data.detach().clone(),
                                self._bit_width)
                    else:
                        tensor.data = fake_quantize(
                            tensor.data.detach().clone(), self._bit_width)
        else:
            if isinstance(tensor, tuple):
                for tensor_item in tensor:
                    tensor_item.data = fake_quantize(
                        tensor_item.data.detach().clone(), self._bit_width)
            else:
                tensor.data = fake_quantize(tensor.data.detach().clone(),
                                            self._bit_width)
        return tensor
_QUANT_HANDLE = QuantAndDeQuantGPU()
def _fuse_conv_bn_weights(conv_w, conv_b, bn_rm, bn_rv, bn_eps, bn_w, bn_b):
""" fuse convolution and batch norm's weight.
Args:
conv_w (torch.nn.Parameter): convolution weight.
conv_b (torch.nn.Parameter): convolution bias.
bn_rm (torch.nn.Parameter): batch norm running mean.
bn_rv (torch.nn.Parameter): batch norm running variance.
bn_eps (torch.nn.Parameter): batch norm epsilon.
bn_w (torch.nn.Parameter): batch norm weight.
bn_b (torch.nn.Parameter): batch norm weight.
Returns:
conv_w(torch.nn.Parameter): fused convolution weight.
conv_b(torch.nn.Parameter): fused convllution bias.
"""
if conv_b is None:
conv_b = bn_rm.new_zeros(bn_rm.shape)
bn_var_rsqrt = torch.rsqrt(bn_rv + bn_eps)
conv_w = conv_w * \
(bn_w * bn_var_rsqrt).reshape([-1] + [1] * (len(conv_w.shape) - 1))
conv_b = (conv_b - bn_rm) * bn_var_rsqrt * bn_w + bn_b
return torch.nn.Parameter(conv_w), torch.nn.Parameter(conv_b)
def _fuse_conv_bn(conv, bn):
    """Fold *bn*'s parameters into *conv* in place; return the updated conv."""
    fused_w, fused_b = _fuse_conv_bn_weights(
        conv.weight, conv.bias, bn.running_mean, bn.running_var,
        bn.eps, bn.weight, bn.bias)
    conv.weight = fused_w
    conv.bias = fused_b
    return conv
def _fuse_modules(model):
    r"""Fuses a list of modules into a single module
    Fuses only the following sequence of modules:
    conv, bn
    All other sequences are left unchanged.
    For these sequences, fuse modules on weight level, keep model structure unchanged.
    Arguments:
        model: Model containing the modules to be fused
    Returns:
        model with fused modules.
    """
    children = list(model.named_children())
    conv_module = None
    conv_name = None
    for name, child in children:
        if isinstance(child, (torch.nn.BatchNorm1d, torch.nn.BatchNorm2d,
                              torch.nn.BatchNorm3d)):
            # Only Conv2d/Conv3d immediately followed by a BN sibling are
            # fused; Conv1d (and anything else) is left untouched.
            if isinstance(conv_module, (torch.nn.Conv2d, torch.nn.Conv3d)):
                conv_module = _fuse_conv_bn(conv_module, child)
                model._modules[conv_name] = conv_module
                # Neutralise the BN layer so the structure is unchanged but
                # BN becomes an identity: zero mean, unit variance, unit
                # gamma, zero beta, stats frozen.
                child.eval()
                child.running_mean = child.running_mean.new_full(
                    child.running_mean.shape, 0)
                child.running_var = child.running_var.new_full(
                    child.running_var.shape, 1)
                if child.weight is not None:
                    child.weight.data = child.weight.data.new_full(
                        child.weight.shape, 1)
                if child.bias is not None:
                    child.bias.data = child.bias.data.new_full(
                        child.bias.shape, 0)
                child.track_running_stats = False
                child.momentum = 0
                child.eps = 0
            conv_module = None
        elif isinstance(child, (torch.nn.Conv2d, torch.nn.Conv3d)):
            # Remember the conv so a directly-following BN sibling can fuse.
            conv_module = child
            conv_name = name
        else:
            # Recurse into container modules; fusion only pairs direct
            # siblings at each level.
            _fuse_modules(child)
    return model
def freeze_bn(m, freeze_bn_affine=True):
    """Freeze batch normalization.

    Puts BN layers into eval mode (stops running-stat updates) and, when
    *freeze_bn_affine* is True, stops gradient updates of the affine
    parameters.  Non-BN modules are left untouched.

    reference: https://arxiv.org/abs/1806.08342

    Args:
        - m (nn.module): torch module (typically applied via model.apply).
        - freeze_bn_affine (bool, optional): Freeze affine scale and
            translation factor or not. Defaults: True.
    """
    if isinstance(
            m,
            (torch.nn.BatchNorm1d, torch.nn.BatchNorm2d, torch.nn.BatchNorm3d)):
        m.eval()
        if freeze_bn_affine:
            # Bug fix: BN layers built with affine=False have weight/bias
            # set to None; guard to avoid an AttributeError.
            if m.weight is not None:
                m.weight.requires_grad = False
            if m.bias is not None:
                m.bias.requires_grad = False
def merge_freeze_bn(model):
    """Fold BN weights into preceding convolutions, then freeze every BN layer.

    Args:
        model (nn.module): model to transform in place.

    Returns:
        [nn.module]: the same model with BN fused and frozen.
    """
    fused = _fuse_modules(model)
    fused.apply(freeze_bn)
    return fused
def unquant_weight(m):
    """Restore a module's original (un-quantized) weight before an update.

    Copies ``weight_origin`` back into ``weight`` so the optimizer steps on
    full-precision values; modules without a backup are silently skipped.

    Args:
        - m (nn.module): torch module.
    """
    try:
        if hasattr(m, "weight_origin") and m.weight is not None:
            m.weight.data.copy_(m.weight_origin.data)
    except (AttributeError, TypeError):
        # Best-effort: tolerate modules with unusual weight attributes.
        pass
def quant_dequant_weight(m):
    """Quantize-dequantize a module's weight in place (manual trigger).

    Args:
        - m (nn.module): torch module.
    """
    global _QUANT_HANDLE
    global _USE_GFPQ_QUANT_LIB
    handle = QuantAndDeQuantGPU() if not _USE_GFPQ_QUANT_LIB else _QUANT_HANDLE
    try:
        if hasattr(m, "weight_origin") and m.weight is not None:
            # Keep a pristine fp32 copy so unquant_weight() can restore it.
            m.weight_origin.data.copy_(m.weight.data)
            m.weight.data = handle(m.weight.data.detach().clone())
    except (AttributeError, TypeError):
        pass
def _quantizing_activation(module, input, output):
    # Forward hook: fake-quantize the output of supported activation layers.
    # The quantization scale is anchored to the running max |activation|
    # stored on the module (buffer registered in register_quantization_hook).
    if isinstance(
            module,
            (torch.nn.ReLU, torch.nn.ELU, torch.nn.LeakyReLU, torch.nn.PReLU)):
        global _QUANT_HANDLE
        global _USE_GFPQ_QUANT_LIB
        quant_handle = _QUANT_HANDLE
        if not _USE_GFPQ_QUANT_LIB:
            quant_handle = QuantAndDeQuantGPU()
        # print("quantizing activation.")
        # print(output[0][0][0])
        output_type = output.dtype
        # Track the largest absolute activation seen so far.
        module.activation_max_value = torch.max(torch.max(torch.abs(output.detach())), module.activation_max_value.to(output_type))
        # print(module.activation_max_value)
        # Append one extra row filled with the running max so the quantizer
        # derives its scale from that max; the row is dropped again below.
        tensor_t = torch.cat((output, torch.ones(output[0].shape).cuda().unsqueeze(0) * module.activation_max_value))
        output.data = quant_handle(tensor_t.float())[:-1]
        # NOTE(review): rebinding the local name below does not change the
        # tensor callers see -- only the in-place `.data` assignment above
        # takes effect.  If a dtype restore is intended, the hook would need
        # to return the converted tensor; confirm intended behavior.
        output = output.to(output_type)
        # print(output[0][0][0])
def _quantizing_data(module, input):
    """Forward pre-hook: fake-quantize a module's input tensors.

    Returns the quantized replacement input.  A forward pre-hook's non-None
    return value substitutes the module's input (see
    Module.register_forward_pre_hook).  Bug fix: the previous implementation
    only rebound the loop variable / the local name ``input``, so the
    quantized tensors were computed and then silently discarded.

    Args:
        module: the module about to run forward (unused here).
        input: tensor or tuple of tensors passed to the module.
    """
    global _QUANT_HANDLE
    global _USE_GFPQ_QUANT_LIB
    quant_handle = _QUANT_HANDLE
    if not _USE_GFPQ_QUANT_LIB:
        quant_handle = QuantAndDeQuantGPU()
    if isinstance(input, tuple):
        quantized = []
        for item in input:
            item_type = item.dtype
            # Quantize in fp32, then restore the caller's dtype.
            quantized.append(quant_handle(item.float()).to(item_type))
        return tuple(quantized)
    input_type = input.dtype
    return quant_handle(input.float()).to(input_type)
def _quantizing_weight(module, input):
    """Forward pre-hook: snapshot the fp32 weight, then fake-quantize it.

    The snapshot in ``weight_origin`` lets unquant_weight() restore the
    full-precision weight after backward.
    """
    global _QUANT_HANDLE
    global _USE_GFPQ_QUANT_LIB
    handle = QuantAndDeQuantGPU() if not _USE_GFPQ_QUANT_LIB else _QUANT_HANDLE
    module.weight_origin.data.copy_(module.weight.data)
    module.weight.data = handle(module.weight.data.detach().clone())
def register_quantization_hook(model,
                               quant_weight=True,
                               quant_activation=True,
                               quant_data=False):
    """register quantization hook for model.

    Recursively walks the module tree and, on each leaf module, registers
    forward (pre-)hooks that fake-quantize weights, input data and/or
    activations.

    Args:
        model (:class:`Module`): Module.
        quant_weight (bool): quantize weights of leaf modules owning a
            non-None ``weight`` (BatchNorm layers excluded).
        quant_activation (bool): quantize outputs of
            ReLU/ELU/LeakyReLU/PReLU modules.
        quant_data (bool): also quantize each leaf module's input tensors.

    Returns:
        Module: self
    """
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)
    for _, module in model._modules.items():
        if len(list(module.children())) > 0:
            # Bug fix: quant_data was previously dropped in this recursive
            # call, so input-data hooks were never registered on any module
            # below the top level.
            register_quantization_hook(module, quant_weight, quant_activation,
                                       quant_data)
        else:
            if quant_weight and hasattr(
                    module,
                    "weight") and module.weight is not None and not isinstance(
                        module, (torch.nn.BatchNorm1d, torch.nn.BatchNorm2d,
                                 torch.nn.BatchNorm3d)):
                # Buffer holding the fp32 weight so it can be restored after
                # backward (see unquant_weight / _quantizing_weight).
                module.register_buffer('weight_origin',
                                       module.weight.detach().clone())
                if quant_data:
                    module.register_forward_pre_hook(_quantizing_data)
                    logger.info("Quantizing input data of %s", str(module))
                module.register_forward_pre_hook(_quantizing_weight)
                logger.info("Quantizing weight of %s", str(module))
            if quant_activation and isinstance(
                    module, (torch.nn.ReLU, torch.nn.ELU, torch.nn.LeakyReLU,
                             torch.nn.PReLU)):
                # Running max |activation| used as the quantization scale.
                module.register_buffer(
                    "activation_max_value",
                    torch.tensor(0, dtype=torch.float).cuda())
                module.register_forward_hook(_quantizing_activation)
                logger.info("Quantizing activation of %s", str(module))
    return model
def test():
    r""" Test GFPG library QuantAndDeQuantGPU.

    Quantizes a small reference tensor, compares it against golden values and
    logs the timing plus a success/failure verdict.
    """
    quant_handle = QuantAndDeQuantGPU()
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)
    tensor = torch.Tensor(np.array([-9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9])).cuda()
    # Fix: log through the configured module logger; the previous calls to
    # the root `logging` module ignored the level set just above.
    logger.info("Origin Data: ")
    logger.info(tensor)
    start_time = datetime.datetime.now()
    quant_tensor = quant_handle(tensor)
    end_time = datetime.datetime.now()
    logger.info("Quant Data: ")
    logger.info(quant_tensor)
    # Golden outputs for the input above (8-bit NNIE log quantization).
    data_expected = np.array([
        -8.7240619659, 0.0000000000, 1.0000000000, 2.0000000000, 2.9536523819,
        4.0000000000, 4.9674310684, 5.9073047638, 7.0250086784, 8.0000000000,
        8.7240619659
    ])
    logger.info("Data expected: ")
    logger.info(" ".join([str(v) for v in data_expected]))
    data_diff = quant_tensor.data.detach().cpu().numpy() - data_expected
    flag = "success."
    for num in data_diff:
        if abs(num) > 0.000000001:
            flag = "failed."
            break  # one mismatch is enough to fail
    run_time = end_time - start_time
    logger.info("QuantAndDeQuantGPU time: %s", str(run_time))
    logger.info("QuantAndDeQuantGPU %s", flag)
================================================
FILE: pyproject.toml
================================================
[build-system]
requires = ["setuptools>=40.8.0", "wheel"]
build-backend = "setuptools.build_meta"
================================================
FILE: setup.cfg
================================================
[metadata]
license_files = LICENSE.txt
================================================
FILE: setup.py
================================================
from setuptools import setup, find_packages
import pathlib
from torch.utils.cpp_extension import BuildExtension, CUDAExtension
from build_helper import check_cuda_version
assert (check_cuda_version())
import os

# Build the static quantization library up front and fail fast on error.
# Bug fix: os.system's exit status was previously ignored, letting a broken
# native build surface later as an obscure extension-link failure.
_make_status = os.system('make -j%d' % os.cpu_count())
if _make_status != 0:
    raise RuntimeError(
        'building libquant_impl via make failed (exit status %d)' %
        _make_status)

here = pathlib.Path(__file__).parent.resolve()
# Long description is fed from the README for PyPI.
long_description = (here / 'README.md').read_text(encoding='utf-8')

setup(
    name='nnieqat',
    version='0.1.0',
    description='A nnie quantization aware training tool on pytorch.',
    long_description=long_description,
    long_description_content_type='text/markdown',
    url='https://github.com/aovoc/nnieqat-pytorch',
    author='Minqin Chen',
    author_email='minqinchen@deepglint.com',
    license='MIT',
    classifiers=[
        'Development Status :: 5 - Production/Stable',
        "Intended Audience :: Science/Research",
        'Intended Audience :: Developers',
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
        "Topic :: Software Development :: Libraries :: Python Modules",
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.5',
        'Programming Language :: Python :: 3.6',
        'Programming Language :: Python :: 3.7',
        'Programming Language :: Python :: 3.8',
        'Programming Language :: Python :: 3 :: Only',
    ],
    keywords=[
        "quantization aware training",
        "deep learning",
        "neural network",
        "CNN",
        "machine learning",
    ],
    packages=find_packages(),
    package_data={
        # Ship the prebuilt GFPQ libraries.  NOTE(review): in this source
        # tree they live under nnieqat/cuda10/lib, so the old "gpu/lib"
        # pattern matched nothing; both patterns are kept (harmless if one
        # matches nothing) -- confirm the intended layout.
        "nnieqat": ["gpu/lib/*gfpq*", "cuda10/lib/*gfpq*"],
    },
    python_requires='>=3.5, <4',
    install_requires=[
        "torch>=1.5",
        "numba>=0.42.0",
        "numpy>=1.18.1"
    ],
    extras_require={
        'test': ["torchvision>=0.4",
                 "nose",
                 "ddt"
                 ],
        'docs': [
            'sphinx==2.4.4',
            'sphinx_rtd_theme'
        ]
    },
    ext_modules=[
        CUDAExtension(
            name="quant_impl",
            sources=[
                "./src/fake_quantize.cpp",
            ],
            # Link against the static library produced by the make step above.
            libraries=['quant_impl'],
            library_dirs=['obj'],
        )
    ],
    cmdclass={'build_ext': BuildExtension},
    test_suite="nnieqat.test.test_cifar10",
)
================================================
FILE: src/fake_quantize.cpp
================================================
#include "fake_quantize.h"
// Input-validation helpers: the extension only accepts contiguous CUDA tensors.
#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)

// Python-facing entry point: validate the tensor, then dispatch to the CUDA
// implementation (fake_quantize_cuda, defined in fake_quantize.cu).
Tensor fake_quantize(Tensor a, int bit_width){
    CHECK_INPUT(a);
    return fake_quantize_cuda(a, bit_width);
}

// Expose fake_quantize to Python as part of the torch extension module.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m){
    m.def("fake_quantize", &fake_quantize, "NNIE Fake Quantization (CUDA)");
}
================================================
FILE: src/fake_quantize.cu
================================================
#include "fake_quantize.h"
// Element-wise NNIE fake-quantization kernel.
// Each value of `a` is snapped to the nearest code of a logarithmic 8- or
// 16-bit grid whose ceiling is anchored at *max_entry; results go to `o`.
__global__ void fake_quantize_kernel_cuda(float* __restrict__ a,
                                          float* o, int size,
                                          float* max_entry,
                                          int bit_width) {
    // Only 8- and 16-bit grids exist; any other request falls back to 16.
    if(bit_width!=8) bit_width =16;
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index < size) {
        // Degenerate all-zero tensor: every quantized value is 0.
        if((*max_entry) < 1e-15 && (*max_entry) > -1e-15){
            o[index] = 0;
            return;
        }
        if(bit_width == 8){
            // Round the grid ceiling up to the next 2^(k/16) step.
            float data_max = (*max_entry);
            int max_entry_qdata_int = floorf(__log2f(data_max) * 16) + 1;
            data_max = __powf(2, __fdividef(max_entry_qdata_int, 16));
            float data_max_floor = __powf(2, __fdividef(max_entry_qdata_int-1, 16));
            // Dead zone: values below half the smallest code collapse to 0.
            if(a[index] <= data_max_floor * 0.0020395972313035 // exp(ln(256) / 128) / 512= 2^(1/16-9) = 1.0442737824274 /512 = 0.0020395972313035
                && a[index] > - data_max * 0.0020395972313035){
                o[index] = 0;
                return;
            }
            //int qdata_int = (int)(log(256 * a[index] / data_max ) / 0.04332169878499658); //ln(256) / 128 = 0.04332169878499658
            int qdata_int = 0;
            if(a[index] > 0){
                qdata_int = rintf(__fdividef( __logf(__fdividef(256* a[index],data_max)), 0.04332169878499658)); //ln(256) / 128 = 0.04332169878
                if(qdata_int > 127) qdata_int = 127;
                else if(qdata_int < 0) qdata_int = 0;
                o[index] = __fdividef(data_max , 256.0) * __expf(qdata_int*0.04332169878499658);
            }
            else{
                qdata_int = - rintf(__fdividef( __logf(__fdividef(- 256* a[index], data_max)), 0.04332169878499658)); //ln(256) / 128 = 0.04332169878
                if(qdata_int < -127) qdata_int = -127;
                else if(qdata_int >-1) qdata_int = -1;
                o[index] = - __fdividef(data_max , 256.0) * __expf(- qdata_int*0.04332169878499658);
            }
        }
        else{
            // 16-bit grid: exponent steps of 1/128, codes in [-32767, 32767].
            float data_max = (*max_entry);
            int max_entry_qdata_int = floorf(__log2f(data_max) * 128) + 1;
            data_max = __powf(2, __fdividef(max_entry_qdata_int, 128));
            // BUG FIX: the divisor here was 16 (copied from the 8-bit branch);
            // the floor bound must use the same 1/128 exponent step as
            // data_max just above, otherwise the dead-zone threshold is
            // wildly wrong for 16-bit quantization.
            float data_max_floor = __powf(2, __fdividef(max_entry_qdata_int-1, 128));
            if(a[index] < data_max_floor *0.0019537861485404 //exp(ln(2^16)/(2^15)) / 512 = 0.0019537861485404
                && a[index] > - data_max * 0.0019537861485404){
                o[index] = 0;
                return;
            }
            int qdata_int = 0;
            if(a[index] > 0){
                qdata_int = rintf(__fdividef( __logf(__fdividef(65536* a[index], data_max)), 0.00033845077175779));
                if(qdata_int > 32767) qdata_int = 32767;
                else if(qdata_int <0) qdata_int = 0;
                o[index] = __fdividef(data_max , 65536.0) * __expf(qdata_int * 0.00033845077175779);
            }
            else{
                qdata_int = - rintf(__fdividef( __logf(__fdividef(- 65536* a[index], data_max)), 0.00033845077175779));
                if(qdata_int < -32767) qdata_int = -32767;
                else if(qdata_int >-1) qdata_int = -1;
                o[index] = - __fdividef(data_max , 65536.0) * __expf(- qdata_int * 0.00033845077175779);
            }
        }
    }
}
// Host-side launcher: quantize tensor `a` on the GPU, return a new tensor.
Tensor fake_quantize_cuda(Tensor a, int bit_width) {
    auto o = at::zeros_like(a);
    int64_t size = a.numel();
    // Quantization scale anchor: the largest absolute value in the tensor.
    Tensor max_entry = at::max(at::abs(a));
    int blockSize = 1024;
    int blockNums = (size + blockSize - 1) / blockSize;
    // NOTE(review): the kernel launch configuration and the data_ptr
    // template arguments were stripped by a text-extraction pass that ate
    // every "<...>" span in this file (cf. the bare #include lines in
    // fake_quantize.h); restored here.
    fake_quantize_kernel_cuda<<<blockNums, blockSize>>>(
        a.data_ptr<float>(),
        o.data_ptr<float>(),
        static_cast<int>(size),  // kernel takes int; explicit narrowing
        max_entry.data_ptr<float>(),
        bit_width);
    return o;
}
================================================
FILE: src/fake_quantize.h
================================================
// NOTE(review): the targets of the #include directives below were lost in a
// text-extraction pass that stripped every "<...>" span in this file (the
// kernel launch syntax in fake_quantize.cu was mangled the same way).  They
// presumably pulled in <torch/extension.h> plus ATen/CUDA runtime headers --
// restore the exact list from version control.
#include
#include
#include
#include
#include
#include
#include
#include
#include
using namespace at;

// Python-facing wrapper: validates input, dispatches to the CUDA path.
Tensor fake_quantize(Tensor a, int bit_width=8);
// Device implementation: returns a new tensor with quantized values.
Tensor fake_quantize_cuda(Tensor a, int bit_width=8);
// Element-wise quantization kernel (defined in fake_quantize.cu).
__global__ void fake_quantize_kernel_cuda(float* __restrict__ a,
                                          float* o, int size,
                                          float* max_entry,
                                          int bit_width=8);
================================================
FILE: src/test/Makefile
================================================
# Uncomment for debugging
DEBUG := 1

# Pretty build
# Q ?= @

CXX := g++
python := python3

# Probe the Python/PyTorch/CUDA environment once at parse time (:=).
# Fix: these shells previously invoked a hard-coded `python`, which can be a
# different (or missing) interpreter even though $(python) is defined above.
PYTHON_HEADER_DIR := $(shell $(python) -c 'from distutils.sysconfig import get_python_inc; print(get_python_inc())')
PYTORCH_INCLUDES := $(shell $(python) -c 'from torch.utils.cpp_extension import include_paths; [print(p) for p in include_paths()]')
PYTORCH_LIBRARIES := $(shell $(python) -c 'from torch.utils.cpp_extension import library_paths; [print(p) for p in library_paths()]')
CUDA_DIR := $(shell $(python) -c 'from torch.utils.cpp_extension import _find_cuda_home; print(_find_cuda_home())')
WITH_ABI := $(shell $(python) -c 'import torch; print(int(torch._C._GLIBCXX_USE_CXX11_ABI))')

INCLUDE_DIRS := ./ $(CUDA_DIR)/include
INCLUDE_DIRS += $(PYTHON_HEADER_DIR)
INCLUDE_DIRS += $(PYTORCH_INCLUDES)

# Custom (MKL/ATLAS/OpenBLAS) include and lib directories.
# BLAS_INCLUDE := /path/to/your/blas
# BLAS_LIB := /path/to/your/blas

SRC_DIR := ./
OBJ_DIR := ./obj
CPP_SRCS := $(wildcard $(SRC_DIR)/*.cpp)
CU_SRCS := $(wildcard $(SRC_DIR)/*.cu)
OBJS := $(patsubst $(SRC_DIR)/%.cpp,$(OBJ_DIR)/%.o,$(CPP_SRCS))
CU_OBJS := $(patsubst $(SRC_DIR)/%.cu,$(OBJ_DIR)/cuda/%.o,$(CU_SRCS))
STATIC_LIB := $(OBJ_DIR)/libquant_impl.a

# Device code for every supported SM plus PTX for forward compatibility.
CUDA_ARCH := -gencode arch=compute_50,code=sm_50 \
             -gencode arch=compute_52,code=sm_52 \
             -gencode arch=compute_60,code=sm_60 \
             -gencode arch=compute_61,code=sm_61 \
             -gencode arch=compute_70,code=sm_70 \
             -gencode arch=compute_75,code=sm_75 \
             -gencode arch=compute_75,code=compute_75

LIBRARIES += stdc++ cudart c10 caffe2 torch torch_python caffe2_gpu

ifeq ($(DEBUG), 1)
  COMMON_FLAGS += -DDEBUG -g -O0
  NVCCFLAGS += -g -G # -rdc true
else
  COMMON_FLAGS += -DNDEBUG -O3
endif

WARNINGS := -Wall -Wno-sign-compare -Wcomment

INCLUDE_DIRS += $(BLAS_INCLUDE)
# -MMD -MP: auto-generate header dependency files alongside objects.
CXXFLAGS += -MMD -MP
COMMON_FLAGS += $(foreach includedir,$(INCLUDE_DIRS),-I$(includedir)) \
                -DTORCH_API_INCLUDE_EXTENSION_H -D_GLIBCXX_USE_CXX11_ABI=$(WITH_ABI)
CXXFLAGS += -pthread -fPIC -fwrapv -std=c++14 $(COMMON_FLAGS) $(WARNINGS)
NVCCFLAGS += -std=c++14 -ccbin=$(CXX) -Xcompiler -fPIC -use_fast_math $(COMMON_FLAGS)
default: $(STATIC_LIB)

# These targets never produce files with their own names; without .PHONY a
# stray file called e.g. `test` or `build` would silently disable them.
.PHONY: default build upload clean test lint lintfull install uninstall

$(OBJ_DIR):
	@ mkdir -p $@
	@ mkdir -p $@/cuda

$(OBJ_DIR)/%.o: $(SRC_DIR)/%.cpp | $(OBJ_DIR)
	@ echo CXX $<
	$(Q)$(CXX) $< $(CXXFLAGS) -c -o $@

# Two nvcc passes: emit a dependency file next to the object, then compile.
$(OBJ_DIR)/cuda/%.o: $(SRC_DIR)/%.cu | $(OBJ_DIR)
	@ echo NVCC $<
	$(Q)nvcc $(NVCCFLAGS) $(CUDA_ARCH) -M $< -o ${@:.o=.d} \
		-odir $(@D)
	$(Q)nvcc $(NVCCFLAGS) $(CUDA_ARCH) -c $< -o $@

# $(RM) already expands to `rm -f`, so the old `-f`/`-rf` spellings were
# redundant; `-r` is kept where directories are removed.
$(STATIC_LIB): $(OBJS) $(CU_OBJS) | $(OBJ_DIR)
	$(RM) $(STATIC_LIB)
	$(RM) -r build dist
	@ echo LD -o $@
	ar rc $(STATIC_LIB) $(OBJS) $(CU_OBJS)

build:
	$(python) setup.py build

upload:
	$(python) setup.py sdist bdist_wheel
	#twine upload dist/*

clean:
	$(RM) -r build dist nnieqat.egg-info obj

test:
	nosetests -s tests/test_quant_impl.py --nologcapture

lint:
	pylint nnieqat --reports=n

lintfull:
	pylint nnieqat

install:
	$(python) setup.py install

uninstall:
	$(python) setup.py install --record install.log
	cat install.log | xargs rm -rf
	$(RM) install.log
================================================
FILE: src/test/test.cu
================================================
#include
#include "../fake_quantize.h"
int main(int argc, char *argv[])
{
    // Smoke-test the 8-bit fake quantization path on a tiny random tensor.
    Tensor probe = randn({2, 2});
    fake_quantize(probe, 8);
    return 0;
}
================================================
FILE: tests/test_cifar10.py
================================================
# -*- coding:utf-8 -*-
from nnieqat import quant_dequant_weight, unquant_weight, merge_freeze_bn, register_quantization_hook
import unittest
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
import torchvision
import torchvision.transforms as transforms
class Net(nn.Module):
    """Small LeNet-style CNN for CIFAR-10: 3x32x32 images in, 10 logits out."""

    def __init__(self):
        super(Net, self).__init__()
        # Two conv/pool stages followed by a three-layer classifier head.
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        """Run the conv feature extractor, flatten, then classify."""
        out = self.pool(F.relu(self.conv1(x)))
        out = self.pool(F.relu(self.conv2(out)))
        out = out.view(-1, 16 * 5 * 5)
        out = F.relu(self.fc1(out))
        out = F.relu(self.fc2(out))
        return self.fc3(out)
class TestCifar10(unittest.TestCase):
    """End-to-end QAT smoke test: train the small CNN on CIFAR-10 with
    quantization hooks attached, then report test-set accuracy."""

    def test(self):
        transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
        ])
        trainset = torchvision.datasets.CIFAR10(root='./data',
                                                train=True,
                                                download=True,
                                                transform=transform)
        trainloader = torch.utils.data.DataLoader(trainset,
                                                  batch_size=4,
                                                  shuffle=True,
                                                  num_workers=2)
        testset = torchvision.datasets.CIFAR10(root='./data',
                                               train=False,
                                               download=True,
                                               transform=transform)
        testloader = torch.utils.data.DataLoader(testset,
                                                 batch_size=4,
                                                 shuffle=True,
                                                 num_workers=2)
        dataiter = iter(trainloader)
        # Bug fix: DataLoader iterators have no .next() method on current
        # PyTorch (it is not part of the Python 3 iterator protocol); use
        # the builtin next() instead.
        images, labels = next(dataiter)
        net = Net()
        # Hooks must be attached before moving the model to the GPU.
        register_quantization_hook(net)
        net.cuda()
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
        print("Cifar10 training:")
        for epoch in range(5):
            net.train()
            if epoch > 2:
                # Fold + freeze BN in the last epochs, per the QAT recipe.
                net = merge_freeze_bn(net)
            running_loss = 0.0
            for i, data in enumerate(trainloader, 0):
                inputs, labels = data
                inputs, labels = Variable(inputs.cuda()), Variable(
                    labels.cuda())
                optimizer.zero_grad()
                outputs = net(inputs)
                loss = criterion(outputs, labels)
                loss.backward()
                # Restore fp32 weights before stepping the optimizer.
                net.apply(unquant_weight)
                optimizer.step()
                running_loss += loss.item()
                if i % 2000 == 1999:
                    print(' epoch %3d, Iter %5d, loss: %.3f' %
                          (epoch + 1, i + 1, running_loss / 2000))
                    running_loss = 0.0
        print('Finished Training.')
        # net.apply(quant_dequant_weight)
        correct = total = 0
        for data in testloader:
            images, labels = data
            outputs = net(Variable(images.cuda()))
            _, predicted = torch.max(outputs.data, 1)
            correct += (predicted == labels.cuda()).sum()
            total += labels.size(0)
        print(
            'Accuracy(10000 test images, modules\' weight unquantize): %d %%' %
            (100.0 * correct / total))
if __name__ == "__main__":
    # Run the single end-to-end test directly, without test discovery.
    runner = unittest.TextTestRunner()
    runner.run(unittest.TestSuite([TestCifar10("test")]))
================================================
FILE: tests/test_imagenet.py
================================================
import argparse
import os
import random
import shutil
import time
import warnings
from nnieqat import quant_dequant_weight, unquant_weight, merge_freeze_bn, register_quantization_hook
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.distributed as dist
import torch.optim
import torch.multiprocessing as mp
import torch.utils.data
import torch.utils.data.distributed
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision.models as models
# Every lowercase callable in torchvision.models is a valid --arch value.
model_names = sorted(name for name in models.__dict__
                     if name.islower() and not name.startswith("__")
                     and callable(models.__dict__[name]))

parser = argparse.ArgumentParser(description='PyTorch ImageNet Training')
parser.add_argument('data', metavar='DIR',
                    help='path to dataset')
# NOTE(review): default is squeezenet1_1 but the help text still says
# resnet18 (inherited from the upstream example) -- confirm which is intended.
parser.add_argument('-a', '--arch', metavar='ARCH', default='squeezenet1_1',
                    choices=model_names,
                    help='model architecture: ' +
                    ' | '.join(model_names) +
                    ' (default: resnet18)')
# NOTE(review): default is 32, help text says 4 -- same upstream drift.
parser.add_argument('-j', '--workers', default=32, type=int, metavar='N',
                    help='number of data loading workers (default: 4)')
parser.add_argument('--epochs', default=120, type=int, metavar='N',
                    help='number of total epochs to run')
parser.add_argument('--start-epoch', default=0, type=int, metavar='N',
                    help='manual epoch number (useful on restarts)')
parser.add_argument('-b', '--batch-size', default=256, type=int,
                    metavar='N',
                    help='mini-batch size (default: 256), this is the total '
                    'batch size of all GPUs on the current node when '
                    'using Data Parallel or Distributed Data Parallel')
parser.add_argument('--lr', '--learning-rate', default=0.001, type=float,
                    metavar='LR', help='initial learning rate', dest='lr')
parser.add_argument('--momentum', default=0.9, type=float, metavar='M',
                    help='momentum')
parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float,
                    metavar='W', help='weight decay (default: 1e-4)',
                    dest='weight_decay')
parser.add_argument('-p', '--print-freq', default=10, type=int,
                    metavar='N', help='print frequency (default: 10)')
parser.add_argument('--resume', default='', type=str, metavar='PATH',
                    help='path to latest checkpoint (default: none)')
parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true',
                    help='evaluate model on validation set')
parser.add_argument('--pretrained', dest='pretrained', action='store_true',
                    help='use pre-trained model')
parser.add_argument('--world-size', default=-1, type=int,
                    help='number of nodes for distributed training')
parser.add_argument('--rank', default=-1, type=int,
                    help='node rank for distributed training')
parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str,
                    help='url used to set up distributed training')
parser.add_argument('--dist-backend', default='nccl', type=str,
                    help='distributed backend')
parser.add_argument('--seed', default=None, type=int,
                    help='seed for initializing training. ')
parser.add_argument('--gpu', default=None, type=int,
                    help='GPU id to use.')
parser.add_argument('--multiprocessing-distributed', action='store_true',
                    help='Use multi-processing distributed training to launch '
                    'N processes per node, which has N GPUs. This is the '
                    'fastest way to use PyTorch for either single node or '
                    'multi node data parallel training')

# Best top-1 accuracy seen so far; updated across epochs in main_worker.
best_acc1 = 0
def main():
    # Top-level entry point: parse CLI args, set up (optional) seeding and
    # distributed bookkeeping, then either spawn one worker per GPU or run a
    # single main_worker in-process.
    args = parser.parse_args()
    if args.seed is not None:
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        cudnn.deterministic = True
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')
    if args.gpu is not None:
        warnings.warn('You have chosen a specific GPU. This will completely '
                      'disable data parallelism.')
    if args.dist_url == "env://" and args.world_size == -1:
        args.world_size = int(os.environ["WORLD_SIZE"])
    args.distributed = args.world_size > 1 or args.multiprocessing_distributed
    ngpus_per_node = torch.cuda.device_count()
    if args.multiprocessing_distributed:
        # Since we have ngpus_per_node processes per node, the total world_size
        # needs to be adjusted accordingly
        args.world_size = ngpus_per_node * args.world_size
        # Use torch.multiprocessing.spawn to launch distributed processes: the
        # main_worker process function
        mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args))
    else:
        # Simply call main_worker function
        main_worker(args.gpu, ngpus_per_node, args)
def main_worker(gpu, ngpus_per_node, args):
    # Per-process worker: builds the model (with nnieqat quantization hooks),
    # wires up (distributed) data loading, optionally resumes from a
    # checkpoint, then runs the train/validate loop, checkpointing the
    # quantized weights each epoch.
    global best_acc1
    args.gpu = gpu
    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))
    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size, rank=args.rank)
    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = models.__dict__[args.arch]()
    # Attach fake-quantization hooks before any DDP/DataParallel wrapping,
    # so the hooks land on the underlying modules.
    register_quantization_hook(model)
    if not torch.cuda.is_available():
        print('using CPU, this will be slow')
    elif args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()
    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)
    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            if args.gpu is not None:
                # best_acc1 may be from a checkpoint from a different GPU
                best_acc1 = best_acc1.to(args.gpu)
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    cudnn.benchmark = True
    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))
    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler)
    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True)
    if args.evaluate:
        validate(val_loader, model, criterion, args)
        return
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, args)
        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args)
        # evaluate on validation set
        acc1 = validate(val_loader, model, criterion, args)
        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)
        if not args.multiprocessing_distributed or (args.multiprocessing_distributed
                                                    and args.rank % ngpus_per_node == 0):
            # dump weight quantized model.
            model.apply(quant_dequant_weight)
            save_checkpoint({
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_acc1': best_acc1,
                'optimizer': optimizer.state_dict(),
            }, is_best)
            # Restore fp32 weights so the next epoch trains unperturbed.
            model.apply(unquant_weight)
def train(train_loader, model, criterion, optimizer, epoch, args):
    # One training epoch.  Weights are fake-quantized by the registered
    # forward hooks; fp32 copies are restored before each optimizer step.
    batch_time = AverageMeter('Time', ':6.3f')
    data_time = AverageMeter('Data', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')
    progress = ProgressMeter(
        len(train_loader),
        [batch_time, data_time, losses, top1, top5],
        prefix="Epoch: [{}]".format(epoch))
    # switch to train mode
    model.train()
    # Fold + freeze BN so quantization sees inference-style convolutions.
    model = merge_freeze_bn(model)
    end = time.time()
    for i, (images, target) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)
        if args.gpu is not None:
            images = images.cuda(args.gpu, non_blocking=True)
        if torch.cuda.is_available():
            target = target.cuda(args.gpu, non_blocking=True)
        # compute output
        output = model(images)
        loss = criterion(output, target)
        # measure accuracy and record loss
        acc1, acc5 = accuracy(output, target, topk=(1, 5))
        losses.update(loss.item(), images.size(0))
        top1.update(acc1[0], images.size(0))
        top5.update(acc5[0], images.size(0))
        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        # Gradients were computed w.r.t. quantized weights; restore the fp32
        # copies so SGD updates the true weights (core QAT trick).
        model.apply(unquant_weight)
        optimizer.step()
        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()
        if i % args.print_freq == 0:
            progress.display(i)
def validate(val_loader, model, criterion, args):
    # Evaluate on the validation set; returns the average top-1 accuracy.
    batch_time = AverageMeter('Time', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')
    progress = ProgressMeter(
        len(val_loader),
        [batch_time, losses, top1, top5],
        prefix='Test: ')
    # switch to evaluate mode
    model.eval()
    with torch.no_grad():
        end = time.time()
        for i, (images, target) in enumerate(val_loader):
            if args.gpu is not None:
                images = images.cuda(args.gpu, non_blocking=True)
            if torch.cuda.is_available():
                target = target.cuda(args.gpu, non_blocking=True)
            # compute output
            output = model(images)
            loss = criterion(output, target)
            # measure accuracy and record loss
            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            losses.update(loss.item(), images.size(0))
            top1.update(acc1[0], images.size(0))
            top5.update(acc5[0], images.size(0))
            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()
            if i % args.print_freq == 0:
                progress.display(i)
        # TODO: this should also be done with the ProgressMeter
        print(' * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}'
              .format(top1=top1, top5=top5))
    return top1.avg
def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
    """Persist the training state; additionally copy it to
    'model_best.pth.tar' when this checkpoint is the best so far."""
    torch.save(state, filename)
    if not is_best:
        return
    shutil.copyfile(filename, 'model_best.pth.tar')
class AverageMeter(object):
    """Computes and stores the average and current value"""

    def __init__(self, name, fmt=':f'):
        self.name = name
        self.fmt = fmt
        self.reset()

    def reset(self):
        """Zero out all statistics."""
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        """Record ``val`` observed ``n`` times; refresh the running average."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count

    def __str__(self):
        template = ''.join(
            ['{name} {val', self.fmt, '} ({avg', self.fmt, '})'])
        return template.format(**self.__dict__)
class ProgressMeter(object):
    """Formats and prints a one-line progress summary for a set of meters."""

    def __init__(self, num_batches, meters, prefix=""):
        self.batch_fmtstr = self._get_batch_fmtstr(num_batches)
        self.meters = meters
        self.prefix = prefix

    def display(self, batch):
        """Print 'prefix [batch/total]' followed by every attached meter."""
        parts = [self.prefix + self.batch_fmtstr.format(batch)]
        parts.extend(str(meter) for meter in self.meters)
        print('\t'.join(parts))

    def _get_batch_fmtstr(self, num_batches):
        """Build a '[  i/total]' format string padded to total's width."""
        width = len(str(num_batches // 1))
        fmt = '{:' + str(width) + 'd}'
        return '[' + fmt + '/' + fmt.format(num_batches) + ']'
def adjust_learning_rate(optimizer, epoch, args):
    """Decay the learning rate by 2.5% (factor 0.975) every 3 epochs.

    Doc fix: the previous docstring ("decayed by 10 every 30 epochs") was
    inherited from the stock PyTorch ImageNet example and did not match the
    actual schedule implemented below.

    Args:
        optimizer: optimizer whose param_groups get the new 'lr'.
        epoch (int): current epoch index.
        args: namespace providing the initial learning rate ``args.lr``.
    """
    lr = args.lr * (0.975 ** (epoch // 3))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
def accuracy(output, target, topk=(1,)):
    """Computes the accuracy over the k top predictions for the specified values of k

    Args:
        output (Tensor): logits of shape (batch, num_classes).
        target (Tensor): ground-truth class indices of shape (batch,).
        topk (tuple): the k values to report.

    Returns:
        list[Tensor]: one single-element tensor (percentage in [0, 100])
        per requested k.
    """
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)
        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))
        res = []
        for k in topk:
            # Bug fix: use reshape(-1) instead of view(-1).  `correct` can be
            # non-contiguous (pred was transposed), and view() raises
            # "view size is not compatible..." on PyTorch >= 1.7 for k > 1.
            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res
# Allow running this training script directly from the command line.
if __name__ == '__main__':
    main()
================================================
FILE: tests/test_merge_freeze_bn.py
================================================
# -*- coding:utf-8 -*-
import unittest
from ddt import ddt, data
import torch
from torch import nn
from nnieqat import merge_freeze_bn, freeze_bn
@ddt
class TestMergeFreezeBNImpl(unittest.TestCase):
    # Verifies that merge_freeze_bn folds BN into the preceding conv without
    # changing the module's numerical output.

    # NOTE(review): these two helpers deliberately omit `self` -- they are
    # invoked as plain functions while the class body executes (building
    # data1/data2 below), never on an instance.
    def conv_bn(inp,
                oup,
                stride,
                conv_layer=nn.Conv2d,
                norm_layer=nn.BatchNorm2d):
        # 3x3 stride-`stride` conv (no bias) followed by BatchNorm.
        return nn.Sequential(conv_layer(inp, oup, 3, stride, 1, bias=False),
                             norm_layer(oup))

    def conv_1x1_bn(inp, oup, conv_layer=nn.Conv2d, norm_layer=nn.BatchNorm2d):
        # 1x1 conv (no bias) followed by BatchNorm.
        return nn.Sequential(conv_layer(inp, oup, 1, 1, 0, bias=False),
                             norm_layer(oup))

    # Fixture modules consumed by the ddt-parameterized test below.
    data1 = conv_bn(3, 3, 2)
    data2 = conv_1x1_bn(3, 3)

    @data(data1, data2)
    def test(self, m):
        # Run the same random input through the module before and after
        # fusion; the printed diff should be (numerically) zero.
        input = torch.randn(1, 3, 10, 10)
        m.eval()
        output_0 = m(input)
        print("module parameter before merge_freeze_bn: ")
        print(list(m.named_parameters()))
        m = merge_freeze_bn(m)
        m.eval()
        output_1 = m(input)
        print("module parameter after merge_freeze_bn: ")
        print(list(m.named_parameters()))
        print("output result before merge_freeze_bn: ")
        print(output_0)
        print("output result after merge_freeze_bn: ")
        print(output_1)
        print("output result diff: ")
        print(output_0 - output_1)
if __name__ == "__main__":
    # Execute the single parameterized test directly.
    runner = unittest.TextTestRunner()
    runner.run(unittest.TestSuite([TestMergeFreezeBNImpl("test")]))
================================================
FILE: tests/test_quant_impl.py
================================================
# -*- coding:utf-8 -*-
import unittest
from ddt import ddt, data
import math
import ctypes
import datetime
from ctypes import *
import numpy as np
from numba import cuda
import numpy as np
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
@ddt
class TestQuantImpl(unittest.TestCase):
    """Compares the GFPQ GPU quantization library (via ctypes) against the
    local fake_quantize implementation on a range of input vectors, printing
    the maximum absolute and relative differences."""

    # Fixture vectors: zeros, dense positive/negative ramps, and hand-picked
    # values around the quantizer's dynamic range; max_thres caps the range.
    max_thres = 512
    data0 = np.array([0])
    data1 = np.array([v / 25600 + 1.04
                      for v in range(25600)] + [100, max_thres])
    # NOTE(review): data2 is assigned twice — the positive ramp below is
    # immediately overwritten by the negated ramp. Looks like a copy-paste
    # slip; kept as-is to preserve behavior. Confirm intent.
    data2 = np.array([v / 25600 + 1.04
                      for v in range(25600)] + [100, max_thres])
    data2 = np.array([-v / 25600 - 1.04
                      for v in range(25600)] + [-100, -max_thres])
    data3 = np.array(
        [0, 1, 2, 2.03992188, 2.03996094, 3, 4, 5, 10, 100, max_thres])
    # Same patterns again, just past the previous threshold value.
    max_thres = 513
    data4 = np.array([v / 25600 + 1.04
                      for v in range(25600)] + [100, max_thres])
    # NOTE(review): data5 is identical to data4 — presumably one of them was
    # meant to differ (compare the data1/data2 pair above). Verify.
    data5 = np.array([v / 25600 + 1.04
                      for v in range(25600)] + [100, max_thres])
    data6 = np.array([-v / 25600 - 1.04
                      for v in range(25600)] + [-100, -max_thres])
    data7 = np.array(
        [0, 1, 2, 2.03992188, 2.03996094, 3, 4, 5, 10, 100, max_thres])
    data8 = np.array([
        0, -1, -2, -2.03992188, -2.03996094, -3, -4, -5, -10, -100, -max_thres
    ])
    data9 = np.array(range(1234))
    data10 = np.array([-v for v in range(1234)])

    @data(data0, data1, data2, data3, data4, data5, data6, data7, data8, data9,
          data10)
    def test(self, data):
        """Quantize `data` with the GFPQ GPU library, then with the Torch
        fake_quantize implementation, and print the resulting differences."""
        os.environ['CUDA_VISIBLE_DEVICES'] = '0'
        # load library
        dl = ctypes.cdll.LoadLibrary
        # NOTE(review): this path says nnieqat/gpu/lib/, but the repository
        # ships the shared object under nnieqat/cuda10/lib/ — confirm path.
        quant_lib = dl("nnieqat/gpu/lib/libgfpq_gpu.so")
        _libcublas = ctypes.cdll.LoadLibrary("libcublas.so")

        # struct GFPQ_PARAM_ST in gfpq.hpp
        class GFPQ_PARAM_ST(ctypes.Structure):
            _fields_ = [("mode", ctypes.c_int), ("buf", ctypes.c_byte * 16)]

        class _types:
            """Some alias types."""
            handle = ctypes.c_void_p
            stream = ctypes.c_void_p

        # Keep the original values for the Torch path; the GFPQ call below
        # overwrites `data` in place via the device buffer copy-back.
        data_origin = data.copy()
        print(
            "----------------------------------------------------------------------"
        )
        print("\n\nOriginal data:")
        print(data)
        # The GFPQ kernel operates on float32 buffers.
        data = data.astype(np.float32)
        stream = cuda.stream()
        _libcublas.cublasCreate_v2.restype = int
        _libcublas.cublasCreate_v2.argtypes = [ctypes.c_void_p]
        cublas_handle = _types.handle()
        _libcublas.cublasCreate_v2(ctypes.byref(cublas_handle))
        data_gpu = cuda.to_device(data, stream=stream)
        data_p = data_gpu.device_ctypes_pointer
        bit_width = 8
        param = GFPQ_PARAM_ST()
        # init or update param first
        param.mode = 0
        ret = quant_lib.HI_GFPQ_QuantAndDeQuant_GPU_PY(data_p, data.size,
                                                       bit_width,
                                                       ctypes.byref(param),
                                                       stream.handle,
                                                       cublas_handle)
        if ret != 0:
            print("HI_GFPQ_QuantAndDeQuant failed(%d)\n" % (ret)),
        # use apply param
        param.mode = 2
        ret = quant_lib.HI_GFPQ_QuantAndDeQuant_GPU_PY(data_p, data.size,
                                                       bit_width,
                                                       ctypes.byref(param),
                                                       stream.handle,
                                                       cublas_handle)
        if ret != 0:
            print("HI_GFPQ_QuantAndDeQuant failed(%d)" % (ret)),
        data_gpu.copy_to_host(data, stream=stream)
        # data may not be available
        stream.synchronize()
        _libcublas.cublasDestroy_v2(cublas_handle)

        # Quantize the same original values with the local implementation.
        import nnieqat
        from quant_impl import fake_quantize
        import torch
        tensor = torch.Tensor(data_origin).cuda()
        tensor.data = fake_quantize(tensor.data.detach(), 8)
        diff = abs(tensor.cpu().numpy() - data)
        # diff_thres = np.max(abs(data)) * 0.001
        # print("\nDIFF > 0.1%: ")
        # print("idx: ", np.where(diff > diff_thres))
        # print("Original data:", data_origin[np.where(diff > diff_thres)])
        # print("GFPQ result:", data[np.where(diff > diff_thres)])
        # print("Impl result:", tensor.cpu().numpy()[np.where(diff > diff_thres)])
        diff_max = np.max(diff)
        print("\nDIFF MAX: " + str(diff_max))
        # Relative diff; the 1e-18 floor guards against dividing by zero
        # when the quantized reference is all zeros (e.g. data0).
        print("\nDIFF RATIO: " +
              str(diff_max / max(np.max(abs(data)), pow(10, -18))))
if __name__ == "__main__":
    # Drive the single data-driven case directly, bypassing test discovery.
    single = unittest.TestSuite()
    single.addTest(TestQuantImpl("test"))
    unittest.TextTestRunner().run(single)