Full Code of aovoc/nnieqat-pytorch

Repository: aovoc/nnieqat-pytorch
Branch: master
Commit: 91410cf331a1
Files: 36
Total size: 75.0 KB

Directory structure:
nnieqat-pytorch/

├── LICENSE.txt
├── MANIFEST.in
├── Makefile
├── README.md
├── build_helper.py
├── docker/
│   └── Dockerfile
├── docs/
│   ├── Makefile
│   ├── make.bat
│   └── source/
│       ├── build_helper.rst
│       ├── conf.py
│       ├── index.rst
│       ├── modules.rst
│       ├── nnieqat.cuda10.rst
│       ├── nnieqat.modules.rst
│       ├── nnieqat.rst
│       └── setup.rst
├── nnieqat/
│   ├── __init__.py
│   ├── cuda10/
│   │   ├── LICENSE.txt
│   │   └── lib/
│   │       ├── gfpq.lib
│   │       ├── libgfpq.a
│   │       ├── libgfpq.so.1.1.5
│   │       ├── libgfpq_gpu.a
│   │       └── libgfpq_gpu.so.1.1.5
│   └── quantize.py
├── pyproject.toml
├── setup.cfg
├── setup.py
├── src/
│   ├── fake_quantize.cpp
│   ├── fake_quantize.cu
│   ├── fake_quantize.h
│   └── test/
│       ├── Makefile
│       └── test.cu
└── tests/
    ├── test_cifar10.py
    ├── test_imagenet.py
    ├── test_merge_freeze_bn.py
    └── test_quant_impl.py

================================================
FILE CONTENTS
================================================

================================================
FILE: LICENSE.txt
================================================
MIT License

Copyright (c) Minqin Chen

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.


================================================
FILE: MANIFEST.in
================================================


================================================
FILE: Makefile
================================================
# Uncomment for debugging
# DEBUG := 1
# Pretty build
# Q ?= @

CXX := g++
python := python3
PYTHON_HEADER_DIR := $(shell $(python) -c 'from distutils.sysconfig import get_python_inc; print(get_python_inc())')
PYTORCH_INCLUDES := $(shell $(python) -c 'from torch.utils.cpp_extension import include_paths; [print(p) for p in include_paths()]')
PYTORCH_LIBRARIES := $(shell $(python) -c 'from torch.utils.cpp_extension import library_paths; [print(p) for p in library_paths()]')

CUDA_DIR := $(shell $(python) -c 'from torch.utils.cpp_extension import _find_cuda_home; print(_find_cuda_home())')
WITH_ABI := $(shell $(python) -c 'import torch; print(int(torch._C._GLIBCXX_USE_CXX11_ABI))')
INCLUDE_DIRS := ./ $(CUDA_DIR)/include
INCLUDE_DIRS += $(PYTHON_HEADER_DIR)
INCLUDE_DIRS += $(PYTORCH_INCLUDES)

# Custom (MKL/ATLAS/OpenBLAS) include and lib directories.
# BLAS_INCLUDE := /path/to/your/blas
# BLAS_LIB := /path/to/your/blas

SRC_DIR := ./src
OBJ_DIR := ./obj
CPP_SRCS := $(wildcard $(SRC_DIR)/*.cpp)
CU_SRCS := $(wildcard $(SRC_DIR)/*.cu)
OBJS := $(patsubst $(SRC_DIR)/%.cpp,$(OBJ_DIR)/%.o,$(CPP_SRCS))
CU_OBJS := $(patsubst $(SRC_DIR)/%.cu,$(OBJ_DIR)/cuda/%.o,$(CU_SRCS))
STATIC_LIB := $(OBJ_DIR)/libquant_impl.a


CUDA_ARCH := -gencode arch=compute_50,code=sm_50 \
		-gencode arch=compute_52,code=sm_52 \
		-gencode arch=compute_60,code=sm_60 \
		-gencode arch=compute_61,code=sm_61 \
		-gencode arch=compute_70,code=sm_70 \
		-gencode arch=compute_75,code=sm_75 \
		-gencode arch=compute_75,code=compute_75


LIBRARIES += stdc++ cudart c10 caffe2 torch torch_python caffe2_gpu


ifeq ($(DEBUG), 1)
	COMMON_FLAGS += -DDEBUG -g -O0
	NVCCFLAGS += -g -G # -rdc true
else
	COMMON_FLAGS += -DNDEBUG -O3
endif

WARNINGS := -Wall -Wno-sign-compare -Wcomment
INCLUDE_DIRS += $(BLAS_INCLUDE)
CXXFLAGS += -MMD -MP
COMMON_FLAGS += $(foreach includedir,$(INCLUDE_DIRS),-I$(includedir)) \
	     -DTORCH_API_INCLUDE_EXTENSION_H -D_GLIBCXX_USE_CXX11_ABI=$(WITH_ABI)
CXXFLAGS += -pthread -fPIC -fwrapv -std=c++14 $(COMMON_FLAGS) $(WARNINGS)
NVCCFLAGS += -std=c++14 -ccbin=$(CXX) -Xcompiler -fPIC -use_fast_math $(COMMON_FLAGS)

default: $(STATIC_LIB)

$(OBJ_DIR):
	@ mkdir -p $@
	@ mkdir -p $@/cuda

$(OBJ_DIR)/%.o: $(SRC_DIR)/%.cpp | $(OBJ_DIR)
	@ echo CXX $<
	$(Q)$(CXX) $< $(CXXFLAGS) -c -o $@

$(OBJ_DIR)/cuda/%.o: $(SRC_DIR)/%.cu | $(OBJ_DIR)
	@ echo NVCC $<
	$(Q)nvcc $(NVCCFLAGS) $(CUDA_ARCH) -M $< -o ${@:.o=.d} \
		-odir $(@D)
	$(Q)nvcc $(NVCCFLAGS) $(CUDA_ARCH) -c $< -o $@

$(STATIC_LIB): $(OBJS) $(CU_OBJS) | $(OBJ_DIR)
	$(RM) -f $(STATIC_LIB)
	$(RM) -rf build dist
	@ echo LD -o $@
	ar rc $(STATIC_LIB) $(OBJS) $(CU_OBJS)

build:
	$(python) setup.py build

upload:
	$(python) setup.py sdist bdist_wheel
	#twine upload dist/*

clean:
	$(RM) -rf build dist nnieqat.egg-info

test:
	nosetests -s tests/test_quant_impl.py --nologcapture
	nosetests -s tests/test_merge_freeze_bn.py --nologcapture

lint:
	pylint nnieqat --reports=n

lintfull:
	pylint nnieqat

install:
	$(python) setup.py install 

uninstall:
	$(python) setup.py install --record install.log
	cat install.log | xargs rm -rf 
	$(RM) install.log


================================================
FILE: README.md
================================================
# nnieqat-pytorch

Nnieqat is a quantization-aware training (QAT) package for the Neural Network Inference Engine (NNIE) on PyTorch. It uses the HiSilicon quantization library to quantize a module's weights and activations, keeping them in fake-quantized FP32 format.


## Table of Contents

- [nnieqat-pytorch](#nnieqat-pytorch)
  - [Table of Contents](#table-of-contents)
  - [Installation](#installation)
  - [Usage](#usage)
  - [Code Examples](#code-examples)
  - [Results](#results)
  - [Todo](#todo)
  - [Reference](#reference)


<div id="installation"></div>  

## Installation

* Supported Platforms: Linux
* Accelerators and GPUs: NVIDIA GPUs via CUDA driver ***10.1*** or ***10.2***.
* Dependencies:
  * python >= 3.5, < 4
  * llvmlite >= 0.31.0
  * pytorch >= 1.5
  * numba >= 0.42.0
  * numpy >= 1.18.1
* Install nnieqat via PyPI:
  ```shell
  $ pip install nnieqat
  ```

* Install nnieqat with Docker (an easy way to sidestep environment problems):
  ```shell
  $ cd docker
  $ docker build -t nnieqat-image .
  ```
* Install nnieqat via repo:
  ```shell
  $ git clone https://github.com/aovoc/nnieqat-pytorch
  $ cd nnieqat-pytorch
  $ make install
  ```
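* Verify the installation (optional):

  Importing the package runs a small built-in self-test of the quantize/dequantize path (it needs a CUDA-capable GPU):

  ```shell
  $ python -c "import nnieqat"
  ```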

<div id="usage"></div>

## Usage

* Add quantization hooks.

  Weights and data are quantized and dequantized with the HiSVP GFPQ library during the forward() pass.

  ```python

  from nnieqat import quant_dequant_weight, unquant_weight, merge_freeze_bn, register_quantization_hook
  ...
  ...
    register_quantization_hook(model)
  ...
  ```

* Merge BN weights into the preceding convolutions and freeze BN.

  When finetuning from a well-trained model (recommended), call merge_freeze_bn at the beginning; otherwise do it after a few epochs of training.

  ```python
  from nnieqat import quant_dequant_weight, unquant_weight, merge_freeze_bn, register_quantization_hook
  ...
  ...
      model.train()
      model = merge_freeze_bn(model)  #it will change bn to eval() mode during training
  ...
  ```

* Unquantize weights before updating them

  ```python
  from nnieqat import quant_dequant_weight, unquant_weight, merge_freeze_bn, register_quantization_hook
  ...
  ...
      model.apply(unquant_weight)  # using original weight while updating
      optimizer.step()
  ...
  ```

* Dump a checkpoint with quantized weights

  ```python
  from nnieqat import quant_dequant_weight, unquant_weight, merge_freeze_bn, register_quantization_hook
  ...
  ...
      model.apply(quant_dequant_weight)
      save_checkpoint(...)
      model.apply(unquant_weight)
  ...
  ```

* Use EMA with caution (not recommended).
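
* Putting it together

  A minimal training-loop sketch combining the calls above (`model`, `optimizer`, `criterion`, `loader` and `num_epochs` are placeholders, not part of nnieqat):

  ```python
  import torch
  from nnieqat import register_quantization_hook, merge_freeze_bn, \
      unquant_weight, quant_dequant_weight

  register_quantization_hook(model)       # fake-quantize weights/activations in forward()
  model.cuda()
  for epoch in range(num_epochs):
      model.train()
      if epoch > 2:                       # after a few epochs of training
          model = merge_freeze_bn(model)  # fold BN into conv; BN stays in eval() mode
      for inputs, targets in loader:
          optimizer.zero_grad()
          loss = criterion(model(inputs.cuda()), targets.cuda())
          loss.backward()
          model.apply(unquant_weight)     # step on the original fp32 weights
          optimizer.step()

  model.apply(quant_dequant_weight)       # bake quantized weights for export
  torch.save(model.state_dict(), "model_qat.pth")
  model.apply(unquant_weight)
  ```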

<div id="code-examples"></div>

## Code Examples

* [Cifar10 quantization aware training example][cifar10_qat]  (adds nnieqat to the [pytorch cifar10 tutorial][cifar10_example])

  ```python tests/test_cifar10.py```

* [ImageNet quantization finetuning example][imagenet_qat]  (adds nnieqat to [pytorch_imagenet_main.py][imagenet_example])

  ```python tests/test_imagenet.py path_to_imagenet_dataset --pretrained```

<div id="results"></div>

## Results  

* ImageNet

  ```
python tests/test_imagenet.py /data/imgnet/ --arch squeezenet1_1 --lr 0.001  --pretrained --epochs 10   # nnie_lr_e-3_ft
python pytorch_imagenet_main.py /data/imgnet/ --arch squeezenet1_1 --lr 0.0001 --pretrained --epochs 10  # lr_e-4_ft
python tests/test_imagenet.py /data/imgnet/ --arch squeezenet1_1 --lr 0.0001 --pretrained --epochs 10  # nnie_lr_e-4_ft
  ```

  Finetuning results:

    |     | trt_fp32 | trt_int8     | nnie     |
    | -------- |  -------- | -------- | -------- |
    | torchvision     | 0.56992  | 0.56424  | 0.56026 |
    | nnie_lr_e-3_ft | 0.56600   | 0.56328   | 0.56612 |
    | lr_e-4_ft  | 0.57884   | 0.57502   | 0.57542 |
    | nnie_lr_e-4_ft | 0.57834   | 0.57524   | 0.57730 |  


* COCO

net: simplified YOLOv5s

Trained for 300 epochs; Hi3559 test results:

 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.338   
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.540   
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.357   
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.187   
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.377   
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.445   
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.284   
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.484   
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.542   
 Average Recall     (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.357   
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.595   
 Average Recall     (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.679   


Finetuned for 20 epochs; Hi3559 test results:

 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.339   
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.539   
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.360   
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.191   
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.378   
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.446   
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.285   
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.485   
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.544   
 Average Recall     (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.361   
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.596   
 Average Recall     (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.683   



<div id="todo"></div>

## Todo

* Generate quantized model directly.

<div id="reference"></div>  

## Reference

HiSVP Quantization Library User Guide (HiSVP 量化库使用指南)

[Quantizing deep convolutional networks for efficient inference: A whitepaper][quant_whitepaper]

[8-bit Inference with TensorRT][trt_quant]

[Distilling the Knowledge in a Neural Network][distillingNN]

[cifar10_qat]: https://github.com/aovoc/nnieqat-pytorch/blob/master/test/test_cifar10.py

[imagenet_qat]: https://github.com/aovoc/nnieqat-pytorch/blob/master/test/test_imagenet.py

[imagenet_example]: https://github.com/pytorch/examples/blob/master/imagenet/main.py

[cifar10_example]: https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html

[quant_whitepaper]: https://arxiv.org/abs/1806.08342

[trt_quant]: https://on-demand.gputechconf.com/gtc/2017/presentation/s7310-8-bit-inference-with-tensorrt.pdf

[distillingNN]: https://arxiv.org/abs/1503.02531


================================================
FILE: build_helper.py
================================================
import os
import shutil
import subprocess
import sys
import tempfile
from distutils import ccompiler


def print_warning(*lines):
    print('**************************************************')
    for line in lines:
        print('*** WARNING: %s' % line)
    print('**************************************************')


def get_path(key):
    return os.environ.get(key, '').split(os.pathsep)


def search_on_path(filenames):
    for p in get_path('PATH'):
        for filename in filenames:
            full = os.path.join(p, filename)
            if os.path.exists(full):
                return os.path.abspath(full)
    return None


minimum_cuda_version = 10010
maximum_cuda_version = 10030
minimum_cudnn_version = 7000


def get_compiler_setting():
    nvcc_path = search_on_path(('nvcc', 'nvcc.exe'))
    cuda_path_default = None
    if nvcc_path is None:
        print_warning('nvcc not in path.', 'Please set path to nvcc.')
    else:
        cuda_path_default = os.path.normpath(
            os.path.join(os.path.dirname(nvcc_path), '..'))

    cuda_path = os.environ.get('CUDA_PATH', '')  # Nvidia default on Windows
    if len(cuda_path) > 0 and cuda_path != cuda_path_default:
        print_warning('nvcc path != CUDA_PATH',
                      'nvcc path: %s' % cuda_path_default,
                      'CUDA_PATH: %s' % cuda_path)

    if not os.path.exists(cuda_path):
        cuda_path = cuda_path_default

    if not cuda_path and os.path.exists('/usr/local/cuda'):
        cuda_path = '/usr/local/cuda'

    include_dirs = []
    library_dirs = []
    define_macros = []

    if cuda_path:
        include_dirs.append(os.path.join(cuda_path, 'include'))
        if sys.platform == 'win32':
            library_dirs.append(os.path.join(cuda_path, 'bin'))
            library_dirs.append(os.path.join(cuda_path, 'lib', 'x64'))
        else:
            library_dirs.append(os.path.join(cuda_path, 'lib64'))
            library_dirs.append(os.path.join(cuda_path, 'lib'))
    if sys.platform == 'darwin':
        library_dirs.append('/usr/local/cuda/lib')

    return {
        'include_dirs': include_dirs,
        'library_dirs': library_dirs,
        'define_macros': define_macros,
        'language': 'c++',
    }


def check_cuda_version():
    compiler = ccompiler.new_compiler()
    settings = get_compiler_setting()
    try:
        out = build_and_run(compiler,
                            '''
        #include <cuda.h>
        #include <stdio.h>
        int main(int argc, char* argv[]) {
          printf("%d", CUDA_VERSION);
          return 0;
        }
        ''',
                            include_dirs=settings['include_dirs'])

    except Exception as e:
        print_warning('Cannot check CUDA version', str(e))
        return False

    cuda_version = int(out)
    if cuda_version < minimum_cuda_version:
        print_warning('CUDA version is too old: %d' % cuda_version,
                      'CUDA v10.1 or CUDA v10.2 is required')
        return False
    if cuda_version > maximum_cuda_version:
        print_warning('CUDA version is too new: %d' % cuda_version,
                      'CUDA v10.1 or CUDA v10.2 is required')

    return True


def check_cudnn_version():
    compiler = ccompiler.new_compiler()
    settings = get_compiler_setting()
    try:
        out = build_and_run(compiler,
                            '''
        #include <cudnn.h>
        #include <stdio.h>
        int main(int argc, char* argv[]) {
          printf("%d", CUDNN_VERSION);
          return 0;
        }
        ''',
                            include_dirs=settings['include_dirs'])

    except Exception as e:
        print_warning('Cannot check cuDNN version\n{0}'.format(e))
        return False

    cudnn_version = int(out)
    if cudnn_version < minimum_cudnn_version:
        print_warning('cuDNN version is too old: %d' % cudnn_version,
                      'cuDNN v7 or newer is required')
        return False

    return True


def build_and_run(compiler,
                  source,
                  libraries=(),
                  include_dirs=(),
                  library_dirs=()):
    temp_dir = tempfile.mkdtemp()

    try:
        fname = os.path.join(temp_dir, 'a.cpp')
        with open(fname, 'w') as f:
            f.write(source)

        objects = compiler.compile([fname],
                                   output_dir=temp_dir,
                                   include_dirs=include_dirs)

        try:
            postargs = ['/MANIFEST'] if sys.platform == 'win32' else []
            compiler.link_executable(objects,
                                     os.path.join(temp_dir, 'a'),
                                     libraries=libraries,
                                     library_dirs=library_dirs,
                                     extra_postargs=postargs,
                                     target_lang='c++')
        except Exception as e:
            msg = 'Cannot build a stub file.\nOriginal error: {0}'.format(e)
            raise Exception(msg)

        try:
            out = subprocess.check_output(os.path.join(temp_dir, 'a'))
            return out

        except Exception as e:
            msg = 'Cannot execute a stub file.\nOriginal error: {0}'.format(e)
            raise Exception(msg)

    finally:
        shutil.rmtree(temp_dir, ignore_errors=True)
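

# Usage sketch (illustrative): setup.py calls check_cuda_version() before
# building. Both checks compile and run a tiny stub that prints the version.
#   >>> from build_helper import check_cuda_version, check_cudnn_version
#   >>> check_cuda_version()   # warns and returns False if CUDA is older than 10.1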


================================================
FILE: docker/Dockerfile
================================================
ARG PYTORCH="1.6.0"
ARG CUDA="10.1"
ARG CUDNN="7"

FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel

ENV TORCH_CUDA_ARCH_LIST="6.0 6.1 7.0+PTX"
ENV TORCH_NVCC_FLAGS="-Xfatbin -compress-all"
ENV CMAKE_PREFIX_PATH="$(dirname $(which conda))/../"

RUN apt-get update && apt-get install -y git ninja-build libglib2.0-0 libsm6 libxrender-dev libxext6 \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Install nnieqat
RUN pip install nnieqat

WORKDIR /root/
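
# Usage sketch (assumes Docker >= 19.03 with the NVIDIA container toolkit):
#   docker build -t nnieqat-image .
#   docker run --gpus all -it nnieqat-image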


================================================
FILE: docs/Makefile
================================================
# Minimal makefile for Sphinx documentation
#

# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS    ?=
SPHINXBUILD   ?= sphinx-build
SOURCEDIR     = source
BUILDDIR      = build

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)


================================================
FILE: docs/make.bat
================================================
@ECHO OFF

pushd %~dp0

REM Command file for Sphinx documentation

if "%SPHINXBUILD%" == "" (
	set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=source
set BUILDDIR=build

if "%1" == "" goto help

%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
	echo.
	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
	echo.installed, then set the SPHINXBUILD environment variable to point
	echo.to the full path of the 'sphinx-build' executable. Alternatively you
	echo.may add the Sphinx directory to PATH.
	echo.
	echo.If you don't have Sphinx installed, grab it from
	echo.http://sphinx-doc.org/
	exit /b 1
)

%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end

:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%

:end
popd


================================================
FILE: docs/source/build_helper.rst
================================================
build\_helper module
====================

.. automodule:: build_helper
   :members:
   :undoc-members:
   :show-inheritance:


================================================
FILE: docs/source/conf.py
================================================
# -*- coding: utf-8 -*-
#
import os
import sys
sys.path.insert(0, os.path.abspath('./../../'))


# -- Project information -----------------------------------------------------

project = 'nnieqat'
copyright = '2020, Minqin Chen'
author = 'Minqin Chen'

# The short X.Y version
version = ''
# The full version, including alpha/beta/rc tags
release = '0.1.0'


# -- General configuration ---------------------------------------------------

# If your documentation needs a minimal Sphinx version, state it here.
#
# needs_sphinx = '1.0'

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
    'sphinx.ext.todo',
    'sphinx.ext.githubpages',
    'sphinx.ext.autodoc',
]

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']

# The suffix(es) of source filenames.
# You can specify multiple suffix as a list of string:
#
# source_suffix = ['.rst', '.md']
source_suffix = '.rst'

# The master toctree document.
master_doc = 'index'

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
language = None

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path .
exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']

# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'


# -- Options for HTML output -------------------------------------------------

# The theme to use for HTML and HTML Help pages.  See the documentation for
# a list of builtin themes.
#

# Theme options are theme-specific and customize the look and feel of a theme
# further.  For a list of options available for each theme, see the
# documentation.
#
# html_theme_options = {}

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']

# Custom sidebar templates, must be a dictionary that maps document names
# to template names.
#
# The default sidebars (for documents that don't match any pattern) are
# defined by theme itself.  Builtin themes are using these templates by
# default: ``['localtoc.html', 'relations.html', 'sourcelink.html',
# 'searchbox.html']``.
#
# html_sidebars = {}
html_theme = 'sphinx_rtd_theme'


================================================
FILE: docs/source/index.rst
================================================
.. nnieqat documentation master file, created by
   sphinx-quickstart on Fri Aug 21 03:52:34 2020.
   You can adapt this file completely to your liking, but it should at least
   contain the root `toctree` directive.

Welcome to nnieqat's documentation!
===================================

.. toctree::
   :maxdepth: 2
   :caption: Contents:



Indices and tables
==================

* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`


================================================
FILE: docs/source/modules.rst
================================================
nnieqat
=======

.. toctree::
   :maxdepth: 4

   nnieqat


================================================
FILE: docs/source/nnieqat.cuda10.rst
================================================
nnieqat.cuda10 package
======================

Submodules
----------

nnieqat.cuda10.quantize module
------------------------------

.. automodule:: nnieqat.cuda10.quantize
   :members:
   :undoc-members:
   :show-inheritance:


Module contents
---------------

.. automodule:: nnieqat.cuda10
   :members:
   :undoc-members:
   :show-inheritance:


================================================
FILE: docs/source/nnieqat.modules.rst
================================================
nnieqat.modules package
=======================

Submodules
----------

nnieqat.modules.conv module
---------------------------

.. automodule:: nnieqat.modules.conv
   :members:
   :undoc-members:
   :show-inheritance:

nnieqat.modules.linear module
-----------------------------

.. automodule:: nnieqat.modules.linear
   :members:
   :undoc-members:
   :show-inheritance:

nnieqat.modules.pooling module
------------------------------

.. automodule:: nnieqat.modules.pooling
   :members:
   :undoc-members:
   :show-inheritance:


Module contents
---------------

.. automodule:: nnieqat.modules
   :members:
   :undoc-members:
   :show-inheritance:


================================================
FILE: docs/source/nnieqat.rst
================================================
nnieqat package
===============

Subpackages
-----------

.. toctree::

   nnieqat.cuda10
   nnieqat.modules

Module contents
---------------

.. automodule:: nnieqat
   :members:
   :undoc-members:
   :show-inheritance:


================================================
FILE: docs/source/setup.rst
================================================
setup module
============

.. automodule:: setup
   :members:
   :undoc-members:
   :show-inheritance:


================================================
FILE: nnieqat/__init__.py
================================================
""" quantize aware training package for  Neural Network Inference Engine(NNIE) on pytorch.
"""
from .quantize import quant_dequant_weight, unquant_weight, freeze_bn, \
    merge_freeze_bn, register_quantization_hook, test
__all__ = [
    "quant_dequant_weight", "unquant_weight", "freeze_bn", "merge_freeze_bn", \
        "register_quantization_hook", "test"]
test()


================================================
FILE: nnieqat/cuda10/LICENSE.txt
================================================
/*
 * Copyright (c) 2018, Hisilicon Limited
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */


================================================
FILE: nnieqat/quantize.py
================================================
#!/usr/bin/env python
"""Quantize function.
"""

import ctypes
import datetime
import logging
from os.path import abspath, dirname
import torch
import numpy as np
from numba import cuda
from quant_impl import fake_quantize

_USE_GFPQ_QUANT_LIB = (torch.cuda.device_count() <= 1)


class GFPQParamSt(ctypes.Structure):
    r"""GFPQ param, corresponds with struct GFPQ_PARAM_ST in gfpq.hpp"""
    _fields_ = [("mode", ctypes.c_int), ("param", ctypes.c_byte * 16)]


class _types:
    r"""Some alias types."""
    handle = ctypes.c_void_p
    stream = ctypes.c_void_p


class QuantAndDeQuantGPU():
    r"""quantize and dequantize data with GFPG library.
    """
    def __init__(self,
                 libquant_path=dirname(abspath(__file__)) +
                 "/gpu/lib/libgfpq_gpu.so",
                 libcublas_path="libcublas.so",
                 bit_width=8,
                 param_mode=0):
        global _USE_GFPQ_QUANT_LIB
        self._bit_width = bit_width
        if _USE_GFPQ_QUANT_LIB:
            self._libquant = ctypes.cdll.LoadLibrary(libquant_path)
            self._libcublas = ctypes.cdll.LoadLibrary(libcublas_path)
            self._libcublas.cublasCreate_v2.restype = int
            self._libcublas.cublasCreate_v2.argtypes = [ctypes.c_void_p]
            self._cublas_handle = _types.handle()
            self._libcublas.cublasCreate_v2(ctypes.byref(self._cublas_handle))
            self._param = GFPQParamSt()
            self._stream = cuda.stream()
            self._param.mode = param_mode

    def __call__(self, tensor, mode=0):
        r""" Converts float weights to quantized weights.

        Args:
            - tensor: input data
            - mode: GFPQ mode for param
                GFPQ_MODE_INIT(0): There is no valid parameter in param[].
                    Generate the parameter and filled in param[].
                GFPQ_MODE_UPDATE(1): There is parameter in param[]. Generate
                    new parameter, update param[] when the new parameter is
                    better.
                GFPQ_MODE_APPLY_ONLY(2): There is parameter in param[]. Don't
                    generate parameter. Just use the param[].
        """

        global _USE_GFPQ_QUANT_LIB
        if _USE_GFPQ_QUANT_LIB:
            ret = -1  # non-zero sentinel: triggers the fallback if the GFPQ call never runs
            try:
                if isinstance(tensor, tuple):
                    for tensor_item in tensor:
                        data_cuda_array = cuda.as_cuda_array(
                            tensor_item.data.detach())
                        data_p = data_cuda_array.device_ctypes_pointer
                        self._param.mode = mode
                        ret = self._libquant.HI_GFPQ_QuantAndDeQuant_GPU_PY(
                            data_p, data_cuda_array.size, self._bit_width,
                            ctypes.byref(self._param), self._stream.handle,
                            self._cublas_handle)
                else:
                    data_cuda_array = cuda.as_cuda_array(tensor.data.detach())
                    data_p = data_cuda_array.device_ctypes_pointer
                    self._param.mode = mode
                    ret = self._libquant.HI_GFPQ_QuantAndDeQuant_GPU_PY(
                        data_p, data_cuda_array.size, self._bit_width,
                        ctypes.byref(self._param), self._stream.handle,
                        self._cublas_handle)
            except Exception:
                pass
            finally:
                if ret != 0:
                    _USE_GFPQ_QUANT_LIB = False
                    logger = logging.getLogger(__name__)
                    logger.setLevel(logging.WARNING)
                    logger.warning(
                        """Failed to quantize data with default HiSVP GFPQ library,
                        Use implemented quantization algorithm instead.""")
                    if isinstance(tensor, tuple):
                        for tensor_item in tensor:
                            tensor_item.data = fake_quantize(
                                tensor_item.data.detach().clone(), self._bit_width)
                    else:
                        tensor.data = fake_quantize(tensor.data.detach().clone(),
                                                    self._bit_width)
        else:
            if isinstance(tensor, tuple):
                for tensor_item in tensor:
                    tensor_item.data = fake_quantize(tensor_item.data.detach().clone(),
                                                     self._bit_width)
            else:
                tensor.data = fake_quantize(tensor.data.detach().clone(),
                                            self._bit_width)
        return tensor


_QUANT_HANDLE = QuantAndDeQuantGPU()
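
# Usage sketch (illustrative; any CUDA float tensor works):
#   >>> t = torch.randn(16).cuda()
#   >>> t = _QUANT_HANDLE(t)          # GFPQ_MODE_INIT: generate params, quant-dequant
#   >>> t = _QUANT_HANDLE(t, mode=2)  # GFPQ_MODE_APPLY_ONLY: reuse stored params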


def _fuse_conv_bn_weights(conv_w, conv_b, bn_rm, bn_rv, bn_eps, bn_w, bn_b):
    """ fuse convolution and batch norm's weight.

    Args:
        conv_w (torch.nn.Parameter): convolution weight.
        conv_b (torch.nn.Parameter): convolution bias.
        bn_rm (torch.nn.Parameter): batch norm running mean.
        bn_rv (torch.nn.Parameter): batch norm running variance.
        bn_eps (torch.nn.Parameter): batch norm epsilon.
        bn_w (torch.nn.Parameter): batch norm weight.
        bn_b (torch.nn.Parameter): batch norm bias.

    Returns:
        conv_w(torch.nn.Parameter): fused convolution weight.
        conv_b(torch.nn.Parameter): fused convolution bias.
    """

    if conv_b is None:
        conv_b = bn_rm.new_zeros(bn_rm.shape)
    bn_var_rsqrt = torch.rsqrt(bn_rv + bn_eps)

    conv_w = conv_w * \
        (bn_w * bn_var_rsqrt).reshape([-1] + [1] * (len(conv_w.shape) - 1))
    conv_b = (conv_b - bn_rm) * bn_var_rsqrt * bn_w + bn_b

    return torch.nn.Parameter(conv_w), torch.nn.Parameter(conv_b)
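
# Fusion algebra (what the function above computes): in eval mode,
#   w_fused = conv_w * bn_w / sqrt(bn_rv + bn_eps)   (broadcast per output channel)
#   b_fused = (conv_b - bn_rm) * bn_w / sqrt(bn_rv + bn_eps) + bn_b
# so conv(x; w_fused, b_fused) == bn(conv(x; conv_w, conv_b)).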


def _fuse_conv_bn(conv, bn):
    conv.weight, conv.bias = \
        _fuse_conv_bn_weights(conv.weight, conv.bias,
                             bn.running_mean, bn.running_var, bn.eps, bn.weight, bn.bias)
    return conv


def _fuse_modules(model):
    r"""Fuses a list of modules into a single module

    Fuses only the following sequence of modules:
    conv, bn
    All other sequences are left unchanged.
    For these sequences, fuse modules on weight level, keep model structure unchanged.

    Arguments:
        model: Model containing the modules to be fused

    Returns:
        model with fused modules.

    """
    children = list(model.named_children())
    conv_module = None
    conv_name = None

    for name, child in children:
        if isinstance(child, (torch.nn.BatchNorm1d, torch.nn.BatchNorm2d,
                              torch.nn.BatchNorm3d)):
            if isinstance(conv_module, (torch.nn.Conv2d, torch.nn.Conv3d)):
                conv_module = _fuse_conv_bn(conv_module, child)
                model._modules[conv_name] = conv_module
                child.eval()
                child.running_mean = child.running_mean.new_full(
                    child.running_mean.shape, 0)
                child.running_var = child.running_var.new_full(
                    child.running_var.shape, 1)
                if child.weight is not None:
                    child.weight.data = child.weight.data.new_full(
                        child.weight.shape, 1)
                if child.bias is not None:
                    child.bias.data = child.bias.data.new_full(
                        child.bias.shape, 0)
                child.track_running_stats = False
                child.momentum = 0
                child.eps = 0
            conv_module = None
        elif isinstance(child, (torch.nn.Conv2d, torch.nn.Conv3d)):
            conv_module = child
            conv_name = name
        else:
            _fuse_modules(child)
    return model


def freeze_bn(m, freeze_bn_affine=True):
    """Freeze batch normalization.
        reference: https://arxiv.org/abs/1806.08342


    Args:
        - m (nn.module): torch module
        - freeze_bn_affine (bool, optional): Freeze affine scale and
        translation factor or not. Defaults: True.
    """

    if isinstance(
            m,
        (torch.nn.BatchNorm1d, torch.nn.BatchNorm2d, torch.nn.BatchNorm3d)):

        m.eval()
        if freeze_bn_affine:
            m.weight.requires_grad = False
            m.bias.requires_grad = False


def merge_freeze_bn(model):
    """merge batch norm's weight into convolution, then freeze it.

    Args:
        model (nn.module): model.

    Returns:
        [nn.module]: model.
    """
    model = _fuse_modules(model)
    model.apply(freeze_bn)
    return model


def unquant_weight(m):
    """ unquantize weight before update weight, avoid training turbulence.

    Args:
        - m (nn.module): torch module.
    """
    try:
        if hasattr(m, "weight_origin") and m.weight is not None:
            m.weight.data.copy_(m.weight_origin.data)
    except AttributeError:
        pass
    except TypeError:
        pass


def quant_dequant_weight(m):
    """ quant weight manually.

    Args:
        - m (nn.module): torch module.
    """
    global _QUANT_HANDLE
    global _USE_GFPQ_QUANT_LIB
    quant_handle = _QUANT_HANDLE
    if not _USE_GFPQ_QUANT_LIB:
        quant_handle = QuantAndDeQuantGPU()
    try:
        if hasattr(m, "weight_origin") and m.weight is not None:
            m.weight_origin.data.copy_(m.weight.data)
            m.weight.data = quant_handle(m.weight.data.detach().clone())
    except AttributeError:
        pass
    except TypeError:
        pass


def _quantizing_activation(module, input, output):
    if isinstance(
            module,
        (torch.nn.ReLU, torch.nn.ELU, torch.nn.LeakyReLU, torch.nn.PReLU)):
        global _QUANT_HANDLE
        global _USE_GFPQ_QUANT_LIB
        quant_handle = _QUANT_HANDLE
        if not _USE_GFPQ_QUANT_LIB:
            quant_handle = QuantAndDeQuantGPU()
        # print("quantizing activation.")
        # print(output[0][0][0])
        output_type = output.dtype
        module.activation_max_value = torch.max(torch.max(torch.abs(output.detach())), module.activation_max_value.to(output_type))
        # print(module.activation_max_value)
        tensor_t = torch.cat((output, torch.ones(output[0].shape).cuda().unsqueeze(0) * module.activation_max_value))
        output.data = quant_handle(tensor_t.float())[:-1]
        output = output.to(output_type)
        # print(output[0][0][0])
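
# Note on the cat/slice above: the running activation max is appended as one
# extra "sample" so the GFPQ parameter generation sees the historical range;
# the extra row is dropped again with [:-1] after quantization.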


def _quantizing_data(module, input):
    global _QUANT_HANDLE
    global _USE_GFPQ_QUANT_LIB
    quant_handle = _QUANT_HANDLE
    if not _USE_GFPQ_QUANT_LIB:
        quant_handle = QuantAndDeQuantGPU()
    # print("quantizing data.")
    # print(input[0][0][0])
    # print("quantizing data.")
    # print(input[0][0][0])
    if isinstance(input, tuple):
        for item in input:
            item_type = item.dtype
            # mutate in place via .data; rebinding the loop variable would be a no-op
            item.data = quant_handle(item.data.detach().float()).to(item_type)
    else:
        input.data = quant_handle(input.data.detach().float())
    # print(input[0][0][0])


def _quantizing_weight(module, input):
    global _QUANT_HANDLE
    global _USE_GFPQ_QUANT_LIB
    quant_handle = _QUANT_HANDLE
    if not _USE_GFPQ_QUANT_LIB:
        quant_handle = QuantAndDeQuantGPU()
    # print("quantizing weight.")
    # print(module.weight[0][0][0])
    module.weight_origin.data.copy_(module.weight.data)
    module.weight.data = quant_handle(module.weight.data.detach().clone())
    # print(module.weight[0][0][0])


def register_quantization_hook(model,
                               quant_weight=True,
                               quant_activation=True,
                               quant_data=False):
    """register quantization hook for model.

    Args:
        model (:class:`Module`): Module.

    Returns:
        Module: self
    """

    #  weight quantizing.
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)

    for _, module in model._modules.items():
        if len(list(module.children())) > 0:
            register_quantization_hook(module, quant_weight, quant_activation)
        else:
            if quant_weight and hasattr(
                    module,
                    "weight") and module.weight is not None and not isinstance(
                        module, (torch.nn.BatchNorm1d, torch.nn.BatchNorm2d,
                                 torch.nn.BatchNorm3d)):
                module.register_buffer('weight_origin', module.weight.detach().clone())
                if quant_data:
                    module.register_forward_pre_hook(_quantizing_data)
                    logger.info("Quantizing input data of %s", str(module))
                module.register_forward_pre_hook(_quantizing_weight)
                logger.info("Quantizing weight of %s", str(module))

            if quant_activation and isinstance(
                    module, (torch.nn.ReLU, torch.nn.ELU, torch.nn.LeakyReLU, torch.nn.PReLU)):
                module.register_buffer("activation_max_value", torch.tensor(0, dtype=torch.float).cuda())
                module.register_forward_hook(_quantizing_activation)
                logger.info("Quantizing activation of %s", str(module))

    return model
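
# Usage sketch (see tests/test_cifar10.py): register hooks before moving the
# model to the GPU and training.
#   >>> net = register_quantization_hook(net)
#   >>> net.cuda()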


def test():
    r""" Test GFPG library QuantAndDeQuantGPU.
    """
    quant_handle = QuantAndDeQuantGPU()
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)
    tensor = torch.Tensor(np.array([-9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9])).cuda()
    logging.info("Origin Data: ")
    logging.info(tensor)

    start_time = datetime.datetime.now()
    quant_tensor = quant_handle(tensor)
    end_time = datetime.datetime.now()

    logging.info("Quant Data: ")
    logging.info(quant_tensor)

    data_expected = np.array([
        -8.7240619659, 0.0000000000, 1.0000000000, 2.0000000000, 2.9536523819,
        4.0000000000, 4.9674310684, 5.9073047638, 7.0250086784, 8.0000000000,
        8.7240619659
    ])

    logging.info("Data expected:  ")
    logging.info(" ".join([str(v) for v in data_expected]))

    data_diff = quant_tensor.data.detach().cpu().numpy() - data_expected
    flag = "success."
    for num in data_diff:
        if abs(num) > 0.000000001:
            flag = "failed."

    run_time = end_time - start_time
    logging.info("QuantAndDeQuantGPU time: %s", str(run_time))
    logging.info("QuantAndDeQuantGPU %s", flag)


================================================
FILE: pyproject.toml
================================================
[build-system]
requires = ["setuptools>=40.8.0", "wheel"]
build-backend = "setuptools.build_meta"


================================================
FILE: setup.cfg
================================================
[metadata]
license_files = LICENSE.txt


================================================
FILE: setup.py
================================================
from setuptools import setup, find_packages
import pathlib
from torch.utils.cpp_extension import BuildExtension, CUDAExtension

from build_helper import check_cuda_version
assert(check_cuda_version())

import os
os.system('make -j%d' % os.cpu_count())

here = pathlib.Path(__file__).parent.resolve()
long_description = (here / 'README.md').read_text(encoding='utf-8')

setup(
    name='nnieqat',
    version='0.1.0',
    description='A nnie quantization aware training tool on pytorch.',
    long_description=long_description,
    long_description_content_type='text/markdown',
    url='https://github.com/aovoc/nnieqat-pytorch',
    author='Minqin Chen',
    author_email='minqinchen@deepglint.com',
    license='MIT',
    classifiers=[
        'Development Status :: 5 - Production/Stable',
        "Intended Audience :: Science/Research",
        'Intended Audience :: Developers',
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
        "Topic :: Software Development :: Libraries :: Python Modules",
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.5',
        'Programming Language :: Python :: 3.6',
        'Programming Language :: Python :: 3.7',
        'Programming Language :: Python :: 3.8',
        'Programming Language :: Python :: 3 :: Only',
    ],
    keywords=[
        "quantization aware training",
        "deep learning",
        "neural network",
        "CNN",
        "machine learning",
    ],
    packages=find_packages(),
    package_data={
        "nnieqat": ["gpu/lib/*gfpq*"],
    },
    python_requires='>=3.5, <4',
    install_requires=[
        "torch>=1.5",
        "numba>=0.42.0",
        "numpy>=1.18.1"
    ],
    extras_require={
        'test': ["torchvision>=0.4",
                 "nose",
                 "ddt"
                 ],
        'docs': [
            'sphinx==2.4.4',
            'sphinx_rtd_theme'
        ]
    },
    ext_modules=[
        CUDAExtension(
            name="quant_impl",
            sources=[
                "./src/fake_quantize.cpp",
            ],
            libraries=['quant_impl'],
            library_dirs=['obj'],
        )
    ],
    cmdclass={'build_ext': BuildExtension},
    test_suite="nnieqat.test.test_cifar10",
)
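
# Build flow (sketch): `make install` runs this file; the os.system('make ...')
# call above first builds obj/libquant_impl.a, which is then linked into the
# `quant_impl` CUDA extension via libraries=['quant_impl'] / library_dirs=['obj'].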


================================================
FILE: src/fake_quantize.cpp
================================================
#include "fake_quantize.h"

#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)

Tensor fake_quantize(Tensor a, int bit_width){
  CHECK_INPUT(a);
  return fake_quantize_cuda(a, bit_width);
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m){
  m.def("fake_quantize", &fake_quantize, "NNIE Fake Quantization (CUDA)");
}
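
// Python-side usage (sketch): once built, the extension is importable as
// `quant_impl` (see nnieqat/quantize.py):
//   >>> from quant_impl import fake_quantize
//   >>> q = fake_quantize(torch.randn(8, device="cuda"), 8)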

================================================
FILE: src/fake_quantize.cu
================================================
#include "fake_quantize.h"
__global__ void fake_quantize_kernel_cuda(float* __restrict__ a,
                                            float* o, int size,
                                            float* max_entry,
                                            int bit_width) {
    if(bit_width!=8) bit_width =16;
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    
    if (index < size) {
        if((*max_entry) < 1e-15 && (*max_entry) > -1e-15){
            o[index] = 0;
            return;
        }

        if(bit_width == 8){
            float data_max = (*max_entry);
            int max_entry_qdata_int =  floorf(__log2f(data_max) * 16) + 1;
            data_max = __powf(2, __fdividef(max_entry_qdata_int, 16));
            float data_max_floor = __powf(2, __fdividef(max_entry_qdata_int-1, 16));

            if(a[index] <= data_max_floor * 0.0020395972313035  // exp(ln(256) / 128) / 512= 2^(1/16-9) = 1.0442737824274 /512 = 0.0020395972313035
                && a[index] > - data_max * 0.0020395972313035){  
                o[index] = 0;
                return;
            }

            //int qdata_int = (int)(log(256 * a[index] / data_max ) / 0.04332169878499658);  //ln(256) / 128 =  0.04332169878499658
            int qdata_int = 0;
            if(a[index] > 0){
                qdata_int = rintf(__fdividef(  __logf(__fdividef(256* a[index],data_max)), 0.04332169878499658));  //ln(256) / 128 =  0.04332169878
                if(qdata_int > 127) qdata_int = 127;
                else if(qdata_int < 0) qdata_int = 0;   
                o[index] =  __fdividef(data_max , 256.0) *  __expf(qdata_int*0.04332169878499658);   
            }
            else{
                qdata_int = - rintf(__fdividef(  __logf(__fdividef(- 256* a[index], data_max)), 0.04332169878499658));  //ln(256) / 128 =  0.04332169878
                if(qdata_int < -127) qdata_int = -127;
                else if(qdata_int >-1) qdata_int = -1;
                o[index] = - __fdividef(data_max , 256.0) * __expf(- qdata_int*0.04332169878499658);
            }

        }
        else{
            float data_max = (*max_entry);
            int max_entry_qdata_int =  floorf(__log2f(data_max) * 128) + 1;
            data_max = __powf(2, __fdividef(max_entry_qdata_int, 128));
            float data_max_floor = __powf(2, __fdividef(max_entry_qdata_int-1, 128));

            
            if(a[index] < data_max_floor *0.0019537861485404  //exp(ln(2^16)/(2^15)) / 512 = 0.0019537861485404
                && a[index] > - data_max * 0.0019537861485404){ 
                o[index] = 0;
                return;
            }

            int qdata_int = 0;
            if(a[index] > 0){
                qdata_int = rintf(__fdividef(  __logf(__fdividef(65536* a[index], data_max)), 0.00033845077175779)); 
                if(qdata_int > 32767) qdata_int = 32767;
                else if(qdata_int <0) qdata_int = 0;
                o[index] =  __fdividef(data_max , 65536.0) * __expf(qdata_int * 0.00033845077175779); 
            }
            else{
                qdata_int = - rintf(__fdividef(  __logf(__fdividef(- 65536* a[index], data_max)), 0.00033845077175779));
                if(qdata_int < -32767) qdata_int = -32767;
                else if(qdata_int >-1) qdata_int = -1;
                o[index] = - __fdividef(data_max , 65536.0) * __expf(- qdata_int * 0.00033845077175779);  
            }
        }

    }
}


Tensor fake_quantize_cuda(Tensor a, int bit_width) {
    auto o = at::zeros_like(a);
    int64_t size = a.numel();
  
    Tensor max_entry = at::max(at::abs(a));
    int blockSize = 1024;
    int blockNums = (size + blockSize - 1) / blockSize;
  
    fake_quantize_kernel_cuda<<<blockNums, blockSize>>>(a.data_ptr<float>(),
                                                        o.data_ptr<float>(),
                                                        size,
                                                        max_entry.data_ptr<float>(),
                                                        bit_width);
    return o;
  }



================================================
FILE: src/fake_quantize.h
================================================
#include <cstdlib>
#include <math.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <climits>
#include <stdint.h>
#include <tuple>
#include <ATen/ATen.h>
#include <torch/torch.h>

using namespace at;

Tensor fake_quantize(Tensor a, int bit_width=8);

Tensor fake_quantize_cuda(Tensor a, int bit_width=8);

__global__ void fake_quantize_kernel_cuda(float* __restrict__ a,
                                            float* o, int size,
                                            float* max_entry,
                                            int bit_width=8);


================================================
FILE: src/test/Makefile
================================================
# Uncomment for debugging
DEBUG := 1
# Pretty build
# Q ?= @

CXX := g++
python := python3
PYTHON_HEADER_DIR := $(shell $(python) -c 'from distutils.sysconfig import get_python_inc; print(get_python_inc())')
PYTORCH_INCLUDES := $(shell $(python) -c 'from torch.utils.cpp_extension import include_paths; [print(p) for p in include_paths()]')
PYTORCH_LIBRARIES := $(shell $(python) -c 'from torch.utils.cpp_extension import library_paths; [print(p) for p in library_paths()]')

CUDA_DIR := $(shell $(python) -c 'from torch.utils.cpp_extension import _find_cuda_home; print(_find_cuda_home())')
WITH_ABI := $(shell $(python) -c 'import torch; print(int(torch._C._GLIBCXX_USE_CXX11_ABI))')
INCLUDE_DIRS := ./ $(CUDA_DIR)/include
INCLUDE_DIRS += $(PYTHON_HEADER_DIR)
INCLUDE_DIRS += $(PYTORCH_INCLUDES)

# Custom (MKL/ATLAS/OpenBLAS) include and lib directories.
# BLAS_INCLUDE := /path/to/your/blas
# BLAS_LIB := /path/to/your/blas

SRC_DIR := ./
OBJ_DIR := ./obj
CPP_SRCS := $(wildcard $(SRC_DIR)/*.cpp)
CU_SRCS := $(wildcard $(SRC_DIR)/*.cu)
OBJS := $(patsubst $(SRC_DIR)/%.cpp,$(OBJ_DIR)/%.o,$(CPP_SRCS))
CU_OBJS := $(patsubst $(SRC_DIR)/%.cu,$(OBJ_DIR)/cuda/%.o,$(CU_SRCS))
STATIC_LIB := $(OBJ_DIR)/libquant_impl.a


CUDA_ARCH := -gencode arch=compute_50,code=sm_50 \
		-gencode arch=compute_52,code=sm_52 \
		-gencode arch=compute_60,code=sm_60 \
		-gencode arch=compute_61,code=sm_61 \
		-gencode arch=compute_70,code=sm_70 \
		-gencode arch=compute_75,code=sm_75 \
		-gencode arch=compute_75,code=compute_75


LIBRARIES += stdc++ cudart c10 caffe2 torch torch_python caffe2_gpu


ifeq ($(DEBUG), 1)
	COMMON_FLAGS += -DDEBUG -g -O0
	NVCCFLAGS += -g -G # -rdc true
else
	COMMON_FLAGS += -DNDEBUG -O3
endif

WARNINGS := -Wall -Wno-sign-compare -Wcomment
INCLUDE_DIRS += $(BLAS_INCLUDE)
CXXFLAGS += -MMD -MP
COMMON_FLAGS += $(foreach includedir,$(INCLUDE_DIRS),-I$(includedir)) \
	     -DTORCH_API_INCLUDE_EXTENSION_H -D_GLIBCXX_USE_CXX11_ABI=$(WITH_ABI)
CXXFLAGS += -pthread -fPIC -fwrapv -std=c++14 $(COMMON_FLAGS) $(WARNINGS)
NVCCFLAGS += -std=c++14 -ccbin=$(CXX) -Xcompiler -fPIC -use_fast_math $(COMMON_FLAGS)

default: $(STATIC_LIB)

$(OBJ_DIR):
	@ mkdir -p $@
	@ mkdir -p $@/cuda

$(OBJ_DIR)/%.o: $(SRC_DIR)/%.cpp | $(OBJ_DIR)
	@ echo CXX $<
	$(Q)$(CXX) $< $(CXXFLAGS) -c -o $@

$(OBJ_DIR)/cuda/%.o: $(SRC_DIR)/%.cu | $(OBJ_DIR)
	@ echo NVCC $<
	$(Q)nvcc $(NVCCFLAGS) $(CUDA_ARCH) -M $< -o ${@:.o=.d} \
		-odir $(@D)
	$(Q)nvcc $(NVCCFLAGS) $(CUDA_ARCH) -c $< -o $@

$(STATIC_LIB): $(OBJS) $(CU_OBJS) | $(OBJ_DIR)
	$(RM) -f $(STATIC_LIB)
	$(RM) -rf build dist
	@ echo LD -o $@
	ar rc $(STATIC_LIB) $(OBJS) $(CU_OBJS)

build:
	$(python) setup.py build

upload:
	$(python) setup.py sdist bdist_wheel
	#twine upload dist/*

clean:
	$(RM) -rf build dist nnieqat.egg-info obj

test:
	nosetests -s tests/test_quant_impl.py --nologcapture

lint:
	pylint nnieqat --reports=n

lintfull:
	pylint nnieqat

install:
	$(python) setup.py install 

uninstall:
	$(python) setup.py install --record install.log
	cat install.log | xargs rm -rf 
	$(RM) install.log


================================================
FILE: src/test/test.cu
================================================
#include <stdio.h>
#include "../fake_quantize.h"

int main(int argc, char *argv[])
{
	Tensor input = randn({2, 2}, at::kCUDA);  // fake_quantize's CHECK_INPUT requires a CUDA tensor
	fake_quantize(input, 8);
	return 0;
}


================================================
FILE: tests/test_cifar10.py
================================================
# -*- coding:utf-8 -*-
from nnieqat import quant_dequant_weight, unquant_weight, merge_freeze_bn, register_quantization_hook
import unittest
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
import torchvision
import torchvision.transforms as transforms



class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = torch.nn.Conv2d(3, 6, 5)
        self.pool = torch.nn.MaxPool2d(2, 2)
        self.conv2 = torch.nn.Conv2d(6, 16, 5)
        self.fc1 = torch.nn.Linear(16 * 5 * 5, 120)
        self.fc2 = torch.nn.Linear(120, 84)
        self.fc3 = torch.nn.Linear(84, 10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 16 * 5 * 5)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

class TestCifar10(unittest.TestCase):
    def test(self):
        transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
        ])
        trainset = torchvision.datasets.CIFAR10(root='./data',
                                                train=True,
                                                download=True,
                                                transform=transform)
        trainloader = torch.utils.data.DataLoader(trainset,
                                                  batch_size=4,
                                                  shuffle=True,
                                                  num_workers=2)
        testset = torchvision.datasets.CIFAR10(root='./data',
                                               train=False,
                                               download=True,
                                               transform=transform)
        testloader = torch.utils.data.DataLoader(testset,
                                                 batch_size=4,
                                                 shuffle=True,
                                                 num_workers=2)

        dataiter = iter(trainloader)
        images, labels = next(dataiter)  # builtin next(); the iterator's .next() is deprecated
        net = Net()
        register_quantization_hook(net)
        net.cuda()
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)


        print("Cifar10 training:")
        for epoch in range(5):
            net.train()
            if epoch > 2:
                net = merge_freeze_bn(net)
            running_loss = 0.0
            for i, data in enumerate(trainloader, 0):
                inputs, labels = data
                inputs, labels = inputs.cuda(), labels.cuda()
                optimizer.zero_grad()
                outputs = net(inputs)
                loss = criterion(outputs, labels)
                loss.backward()
                net.apply(unquant_weight)
                optimizer.step()

                running_loss += loss.item()
                if i % 2000 == 1999:
                    print(' epoch %3d, Iter %5d, loss: %.3f' %
                                (epoch + 1, i + 1, running_loss / 2000))
                    running_loss = 0.0
        print('Finished Training.')

        # net.apply(quant_dequant_weight)
        correct = total = 0
        net.eval()
        with torch.no_grad():
            for data in testloader:
                images, labels = data
                outputs = net(images.cuda())
                _, predicted = torch.max(outputs.data, 1)
                correct += (predicted == labels.cuda()).sum()
                total += labels.size(0)
        print(
            'Accuracy (10000 test images, modules\' weights unquantized): %d %%' %
            (100.0 * correct.item() / total))


if __name__ == "__main__":
    suite = unittest.TestSuite()
    suite.addTest(TestCifar10("test"))
    runner = unittest.TextTestRunner()
    runner.run(suite)

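The training loop above is the package's QAT step order in miniature: the
registered hooks fake-quantize weights during the forward pass, the backward
pass computes gradients against those quantized values, unquant_weight restores
the full-precision weights, and only then does the optimizer update them.
Distilled to a skeleton (net, criterion, optimizer and trainloader as defined
in the test above):

for inputs, labels in trainloader:
    inputs, labels = inputs.cuda(), labels.cuda()
    optimizer.zero_grad()
    outputs = net(inputs)              # hooks quantize weights on the fly
    loss = criterion(outputs, labels)
    loss.backward()                    # gradients w.r.t. the quantized weights
    net.apply(unquant_weight)          # restore float weights first ...
    optimizer.step()                   # ... then apply the update to them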

================================================
FILE: tests/test_imagenet.py
================================================
import argparse
import os
import random
import shutil
import time
import warnings

from nnieqat import quant_dequant_weight, unquant_weight, merge_freeze_bn, register_quantization_hook
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.distributed as dist
import torch.optim
import torch.multiprocessing as mp
import torch.utils.data
import torch.utils.data.distributed
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision.models as models

model_names = sorted(name for name in models.__dict__
    if name.islower() and not name.startswith("__")
    and callable(models.__dict__[name]))

parser = argparse.ArgumentParser(description='PyTorch ImageNet Training')
parser.add_argument('data', metavar='DIR',
                    help='path to dataset')
parser.add_argument('-a', '--arch', metavar='ARCH', default='squeezenet1_1',
                    choices=model_names,
                    help='model architecture: ' +
                        ' | '.join(model_names) +
                        ' (default: squeezenet1_1)')
parser.add_argument('-j', '--workers', default=32, type=int, metavar='N',
                    help='number of data loading workers (default: 32)')
parser.add_argument('--epochs', default=120, type=int, metavar='N',
                    help='number of total epochs to run')
parser.add_argument('--start-epoch', default=0, type=int, metavar='N',
                    help='manual epoch number (useful on restarts)')
parser.add_argument('-b', '--batch-size', default=256, type=int,
                    metavar='N',
                    help='mini-batch size (default: 256), this is the total '
                         'batch size of all GPUs on the current node when '
                         'using Data Parallel or Distributed Data Parallel')
parser.add_argument('--lr', '--learning-rate', default=0.001, type=float,
                    metavar='LR', help='initial learning rate', dest='lr')
parser.add_argument('--momentum', default=0.9, type=float, metavar='M',
                    help='momentum')
parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float,
                    metavar='W', help='weight decay (default: 1e-4)',
                    dest='weight_decay')
parser.add_argument('-p', '--print-freq', default=10, type=int,
                    metavar='N', help='print frequency (default: 10)')
parser.add_argument('--resume', default='', type=str, metavar='PATH',
                    help='path to latest checkpoint (default: none)')
parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true',
                    help='evaluate model on validation set')
parser.add_argument('--pretrained', dest='pretrained', action='store_true',
                    help='use pre-trained model')
parser.add_argument('--world-size', default=-1, type=int,
                    help='number of nodes for distributed training')
parser.add_argument('--rank', default=-1, type=int,
                    help='node rank for distributed training')
parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str,
                    help='url used to set up distributed training')
parser.add_argument('--dist-backend', default='nccl', type=str,
                    help='distributed backend')
parser.add_argument('--seed', default=None, type=int,
                    help='seed for initializing training. ')
parser.add_argument('--gpu', default=None, type=int,
                    help='GPU id to use.')
parser.add_argument('--multiprocessing-distributed', action='store_true',
                    help='Use multi-processing distributed training to launch '
                         'N processes per node, which has N GPUs. This is the '
                         'fastest way to use PyTorch for either single node or '
                         'multi node data parallel training')

best_acc1 = 0


def main():
    args = parser.parse_args()

    if args.seed is not None:
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        cudnn.deterministic = True
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')

    if args.gpu is not None:
        warnings.warn('You have chosen a specific GPU. This will completely '
                      'disable data parallelism.')

    if args.dist_url == "env://" and args.world_size == -1:
        args.world_size = int(os.environ["WORLD_SIZE"])

    args.distributed = args.world_size > 1 or args.multiprocessing_distributed

    ngpus_per_node = torch.cuda.device_count()
    if args.multiprocessing_distributed:
        # Since we have ngpus_per_node processes per node, the total world_size
        # needs to be adjusted accordingly
        args.world_size = ngpus_per_node * args.world_size
        # Use torch.multiprocessing.spawn to launch distributed processes: the
        # main_worker process function
        mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args))
    else:
        # Simply call main_worker function
        main_worker(args.gpu, ngpus_per_node, args)


def main_worker(gpu, ngpus_per_node, args):
    global best_acc1
    args.gpu = gpu

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size, rank=args.rank)
    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = models.__dict__[args.arch]()

    register_quantization_hook(model)

    if not torch.cuda.is_available():
        print('using CPU, this will be slow')
    elif args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)

    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            if args.gpu is not None:
                # best_acc1 may be from a checkpoint from a different GPU
                best_acc1 = best_acc1.to(args.gpu)
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler)

    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True)

    if args.evaluate:
        validate(val_loader, model, criterion, args)
        return

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, args)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args)

        # evaluate on validation set
        acc1 = validate(val_loader, model, criterion, args)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        if not args.multiprocessing_distributed or (args.multiprocessing_distributed
                and args.rank % ngpus_per_node == 0):
            # save the checkpoint with quantized weights (the deployment view),
            # then restore full precision below to continue training
            model.apply(quant_dequant_weight)
            save_checkpoint({
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_acc1': best_acc1,
                'optimizer': optimizer.state_dict(),
            }, is_best)
            model.apply(unquant_weight)


def train(train_loader, model, criterion, optimizer, epoch, args):
    batch_time = AverageMeter('Time', ':6.3f')
    data_time = AverageMeter('Data', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')
    progress = ProgressMeter(
        len(train_loader),
        [batch_time, data_time, losses, top1, top5],
        prefix="Epoch: [{}]".format(epoch))

    # switch to train mode
    model.train()
    model = merge_freeze_bn(model)
    end = time.time()

    for i, (images, target) in enumerate(train_loader): 
        # measure data loading time
        data_time.update(time.time() - end)

        if args.gpu is not None:
            images = images.cuda(args.gpu, non_blocking=True)
        if torch.cuda.is_available():
            target = target.cuda(args.gpu, non_blocking=True)

        # compute output
        output = model(images)
        loss = criterion(output, target)

        # measure accuracy and record loss
        acc1, acc5 = accuracy(output, target, topk=(1, 5))
        losses.update(loss.item(), images.size(0))
        top1.update(acc1[0], images.size(0))
        top5.update(acc5[0], images.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        model.apply(unquant_weight)
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0:
            progress.display(i)


def validate(val_loader, model, criterion, args):
    batch_time = AverageMeter('Time', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')
    progress = ProgressMeter(
        len(val_loader),
        [batch_time, losses, top1, top5],
        prefix='Test: ')

    # switch to evaluate mode
    model.eval()

    with torch.no_grad():
        end = time.time()
        for i, (images, target) in enumerate(val_loader):
            if args.gpu is not None:
                images = images.cuda(args.gpu, non_blocking=True)
            if torch.cuda.is_available():
                target = target.cuda(args.gpu, non_blocking=True)

            # compute output
            output = model(images)
            loss = criterion(output, target)

            # measure accuracy and record loss
            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            losses.update(loss.item(), images.size(0))
            top1.update(acc1[0], images.size(0))
            top5.update(acc5[0], images.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % args.print_freq == 0:
                progress.display(i)

        # TODO: this should also be done with the ProgressMeter
        print(' * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}'
              .format(top1=top1, top5=top5))

    return top1.avg


def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
    torch.save(state, filename)
    if is_best:
        shutil.copyfile(filename, 'model_best.pth.tar')


class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self, name, fmt=':f'):
        self.name = name
        self.fmt = fmt
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

    def __str__(self):
        fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})'
        return fmtstr.format(**self.__dict__)


class ProgressMeter(object):
    def __init__(self, num_batches, meters, prefix=""):
        self.batch_fmtstr = self._get_batch_fmtstr(num_batches)
        self.meters = meters
        self.prefix = prefix

    def display(self, batch):
        entries = [self.prefix + self.batch_fmtstr.format(batch)]
        entries += [str(meter) for meter in self.meters]
        print('\t'.join(entries))

    def _get_batch_fmtstr(self, num_batches):
        num_digits = len(str(num_batches))
        fmt = '{:' + str(num_digits) + 'd}'
        return '[' + fmt + '/' + fmt.format(num_batches) + ']'


def adjust_learning_rate(optimizer, epoch, args):
    """Sets the learning rate to the initial LR decayed by 10 every 30 epochs"""
    lr = args.lr * (0.975 ** (epoch // 3))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr


def accuracy(output, target, topk=(1,)):
    """Computes the accuracy over the k top predictions for the specified values of k"""
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)

        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))

        res = []
        for k in topk:
            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res


if __name__ == '__main__':
    main()

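Assuming an ImageNet-style directory with train/ and val/ subfolders (the
layout the script expects), a typical single-GPU invocation would be:

    python tests/test_imagenet.py /path/to/imagenet -a squeezenet1_1 --pretrained --gpu 0

All flags shown are defined by the argument parser above; --pretrained starts
from the torchvision weights, the usual starting point when fine-tuning with
quantization hooks enabled.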

================================================
FILE: tests/test_merge_freeze_bn.py
================================================
# -*- coding:utf-8 -*-
import unittest
from ddt import ddt, data
import torch
from torch import nn
from nnieqat import merge_freeze_bn


@ddt
class TestMergeFreezeBNImpl(unittest.TestCase):
    # Plain helpers (no self): they are invoked during class-body execution
    # below, before any instance exists.
    def conv_bn(inp,
                oup,
                stride,
                conv_layer=nn.Conv2d,
                norm_layer=nn.BatchNorm2d):
        return nn.Sequential(conv_layer(inp, oup, 3, stride, 1, bias=False),
                             norm_layer(oup))

    def conv_1x1_bn(inp, oup, conv_layer=nn.Conv2d, norm_layer=nn.BatchNorm2d):
        return nn.Sequential(conv_layer(inp, oup, 1, 1, 0, bias=False),
                             norm_layer(oup))

    data1 = conv_bn(3, 3, 2)
    data2 = conv_1x1_bn(3, 3)

    @data(data1, data2)
    def test(self, m):
        input = torch.randn(1, 3, 10, 10)
        m.eval()
        output_0 = m(input)
        print("module parameter before merge_freeze_bn: ")
        print(list(m.named_parameters()))

        m = merge_freeze_bn(m)
        m.eval()
        output_1 = m(input)
        print("module parameter after merge_freeze_bn: ")
        print(list(m.named_parameters()))

        print("output result before merge_freeze_bn: ")
        print(output_0)
        print("output result after merge_freeze_bn: ")
        print(output_1)
        print("output result diff: ")
        print(output_0 - output_1)


if __name__ == "__main__":
    suite = unittest.TestSuite()
    suite.addTest(TestMergeFreezeBNImpl("test"))
    runner = unittest.TextTestRunner()
    runner.run(suite)

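The identity merge_freeze_bn relies on is standard conv+BN folding: a BatchNorm
applied to a convolution's output is equivalent to a convolution with rescaled
weights and a shifted bias. A sketch of what _fuse_conv_bn_weights in
nnieqat/quantize.py computes, with argument names taken from the symbol index
below (the real implementation may differ in details):

import torch

def fold_conv_bn(conv_w, conv_b, bn_rm, bn_rv, bn_eps, bn_w, bn_b):
    # BN(conv(x)) == conv'(x) with a per-output-channel scale and shift
    scale = bn_w / torch.sqrt(bn_rv + bn_eps)
    if conv_b is None:  # the test modules above use bias=False
        conv_b = torch.zeros_like(bn_rm)
    fused_w = conv_w * scale.reshape(-1, 1, 1, 1)
    fused_b = (conv_b - bn_rm) * scale + bn_b
    return fused_w, fused_b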

================================================
FILE: tests/test_quant_impl.py
================================================
# -*- coding:utf-8 -*-
import ctypes
import os
import unittest

import numpy as np
from ddt import ddt, data
from numba import cuda

os.environ["CUDA_VISIBLE_DEVICES"] = "0"

@ddt
class TestQuantImpl(unittest.TestCase):
    max_thres = 512
    data0 = np.array([0])
    data1 = np.array([v / 25600 + 1.04
                      for v in range(25600)] + [100, max_thres])
    data2 = np.array([-v / 25600 - 1.04
                      for v in range(25600)] + [-100, -max_thres])
    data3 = np.array(
        [0, 1, 2, 2.03992188, 2.03996094, 3, 4, 5, 10, 100, max_thres])
    max_thres = 513
    data4 = np.array([v / 25600 + 1.04
                      for v in range(25600)] + [100, max_thres])
    data5 = np.array([v / 25600 + 1.04
                      for v in range(25600)] + [100, max_thres])
    data6 = np.array([-v / 25600 - 1.04
                      for v in range(25600)] + [-100, -max_thres])
    data7 = np.array(
        [0, 1, 2, 2.03992188, 2.03996094, 3, 4, 5, 10, 100, max_thres])
    data8 = np.array([
        0, -1, -2, -2.03992188, -2.03996094, -3, -4, -5, -10, -100, -max_thres
    ])
    data9 = np.array(range(1234))
    data10 = np.array([-v for v in range(1234)])

    @data(data0, data1, data2, data3, data4, data5, data6, data7, data8, data9,
          data10)
    def test(self, data):
        # load the GFPQ GPU library; adjust the path to the packaged layout
        # if needed (the repository ships it under nnieqat/cuda10/lib/)
        dl = ctypes.cdll.LoadLibrary
        quant_lib = dl("nnieqat/gpu/lib/libgfpq_gpu.so")
        _libcublas = ctypes.cdll.LoadLibrary("libcublas.so")

        # struct GFPQ_PARAM_ST in gfpq.hpp
        class GFPQ_PARAM_ST(ctypes.Structure):
            _fields_ = [("mode", ctypes.c_int), ("buf", ctypes.c_byte * 16)]

        class _types:
            """Some alias types."""
            handle = ctypes.c_void_p
            stream = ctypes.c_void_p

        data_origin = data.copy()

        print(
            "----------------------------------------------------------------------"
        )
        print("\n\nOriginal data:")
        print(data)

        data = data.astype(np.float32)
        stream = cuda.stream()

        _libcublas.cublasCreate_v2.restype = int
        _libcublas.cublasCreate_v2.argtypes = [ctypes.c_void_p]
        cublas_handle = _types.handle()
        _libcublas.cublasCreate_v2(ctypes.byref(cublas_handle))

        data_gpu = cuda.to_device(data, stream=stream)
        data_p = data_gpu.device_ctypes_pointer
        bit_width = 8

        param = GFPQ_PARAM_ST()
        # mode 0: measure the data and initialize/update the quantization param
        param.mode = 0
        ret = quant_lib.HI_GFPQ_QuantAndDeQuant_GPU_PY(data_p, data.size,
                                                       bit_width,
                                                       ctypes.byref(param),
                                                       stream.handle,
                                                       cublas_handle)
        if ret != 0:
            print("HI_GFPQ_QuantAndDeQuant failed (%d)" % ret)

        # mode 2: apply the stored param (quantize-dequantize in place)
        param.mode = 2
        ret = quant_lib.HI_GFPQ_QuantAndDeQuant_GPU_PY(data_p, data.size,
                                                       bit_width,
                                                       ctypes.byref(param),
                                                       stream.handle,
                                                       cublas_handle)
        if ret != 0:
            print("HI_GFPQ_QuantAndDeQuant failed (%d)" % ret)

        data_gpu.copy_to_host(data, stream=stream)
        # data may not be available
        stream.synchronize()
        _libcublas.cublasDestroy_v2(cublas_handle)

        import torch
        import nnieqat  # noqa: F401
        # quant_impl is the CUDA extension built from src/fake_quantize.*
        from quant_impl import fake_quantize
        tensor = torch.Tensor(data_origin).cuda()
        tensor.data = fake_quantize(tensor.data.detach(), 8)

        diff = abs(tensor.cpu().numpy() - data)
        # diff_thres = np.max(abs(data)) * 0.001
        # print("\nDIFF > 0.1%: ")
        # print("idx: ", np.where(diff > diff_thres))
        # print("Original data:", data_origin[np.where(diff > diff_thres)])
        # print("GFPQ result:", data[np.where(diff > diff_thres)])
        # print("Impl result:", tensor.cpu().numpy()[np.where(diff > diff_thres)])
        diff_max = np.max(diff)
        print("\nDIFF MAX: " + str(diff_max))
        print("\nDIFF RATIO: " +
              str(diff_max / max(np.max(abs(data)), pow(10, -18))))


if __name__ == "__main__":
    suite = unittest.TestSuite()
    suite.addTest(TestQuantImpl("test"))
    runner = unittest.TextTestRunner()
    runner.run(suite)
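
The raw ctypes path exercised above (load libgfpq_gpu, create a cuBLAS handle,
call HI_GFPQ_QuantAndDeQuant_GPU_PY with mode 0 to fit the parameters and mode 2
to apply them) is what nnieqat/quantize.py wraps in the QuantAndDeQuantGPU
class; see the symbol index below. A hedged usage sketch, assuming the class
handles library loading and the cuBLAS handle internally:

import torch
from nnieqat.quantize import QuantAndDeQuantGPU  # class name per the symbol index

quantizer = QuantAndDeQuantGPU()  # assumed to load libgfpq_gpu and set up cuBLAS
t = torch.randn(1024).cuda()
t_q = quantizer(t)                # __call__(self, tensor, mode=0); assumed to return the result
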
SYMBOL INDEX (53 symbols across 7 files)

FILE: build_helper.py
  function print_warning (line 9) | def print_warning(*lines):
  function get_path (line 16) | def get_path(key):
  function search_on_path (line 20) | def search_on_path(filenames):
  function get_compiler_setting (line 34) | def get_compiler_setting():
  function check_cuda_version (line 78) | def check_cuda_version():
  function check_cudnn_version (line 109) | def check_cudnn_version():
  function build_and_run (line 137) | def build_and_run(compiler,

FILE: nnieqat/quantize.py
  class GFPQParamSt (line 17) | class GFPQParamSt(ctypes.Structure):
  class _types (line 22) | class _types:
  class QuantAndDeQuantGPU (line 28) | class QuantAndDeQuantGPU():
    method __init__ (line 31) | def __init__(self,
    method __call__ (line 50) | def __call__(self, tensor, mode=0):
  function _fuse_conv_bn_weights (line 117) | def _fuse_conv_bn_weights(conv_w, conv_b, bn_rm, bn_rv, bn_eps, bn_w, bn...
  function _fuse_conv_bn (line 145) | def _fuse_conv_bn(conv, bn):
  function _fuse_modules (line 152) | def _fuse_modules(model):
  function freeze_bn (line 200) | def freeze_bn(m, freeze_bn_affine=True):
  function merge_freeze_bn (line 221) | def merge_freeze_bn(model):
  function unquant_weight (line 235) | def unquant_weight(m):
  function quant_dequant_weight (line 250) | def quant_dequant_weight(m):
  function _quantizing_activation (line 271) | def _quantizing_activation(module, input, output):
  function _quantizing_data (line 291) | def _quantizing_data(module, input):
  function _quantizing_weight (line 313) | def _quantizing_weight(module, input):
  function register_quantization_hook (line 326) | def register_quantization_hook(model,
  function test (line 368) | def test():

FILE: src/fake_quantize.cpp
  function Tensor (line 7) | Tensor fake_quantize(Tensor a, int bit_width){
  function PYBIND11_MODULE (line 12) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m){

FILE: tests/test_cifar10.py
  class Net (line 14) | class Net(nn.Module):
    method __init__ (line 15) | def __init__(self):
    method forward (line 24) | def forward(self, x):
  class TestCifar10 (line 33) | class TestCifar10(unittest.TestCase):
    method test (line 34) | def test(self):

FILE: tests/test_imagenet.py
  function main (line 81) | def main():
  function main_worker (line 116) | def main_worker(gpu, ngpus_per_node, args):
  function train (line 271) | def train(train_loader, model, criterion, optimizer, epoch, args):
  function validate (line 320) | def validate(val_loader, model, criterion, args):
  function save_checkpoint (line 365) | def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
  class AverageMeter (line 371) | class AverageMeter(object):
    method __init__ (line 373) | def __init__(self, name, fmt=':f'):
    method reset (line 378) | def reset(self):
    method update (line 384) | def update(self, val, n=1):
    method __str__ (line 390) | def __str__(self):
  class ProgressMeter (line 395) | class ProgressMeter(object):
    method __init__ (line 396) | def __init__(self, num_batches, meters, prefix=""):
    method display (line 401) | def display(self, batch):
    method _get_batch_fmtstr (line 406) | def _get_batch_fmtstr(self, num_batches):
  function adjust_learning_rate (line 412) | def adjust_learning_rate(optimizer, epoch, args):
  function accuracy (line 419) | def accuracy(output, target, topk=(1,)):

FILE: tests/test_merge_freeze_bn.py
  class TestMergeFreezeBNImpl (line 10) | class TestMergeFreezeBNImpl(unittest.TestCase):
    method conv_bn (line 11) | def conv_bn(inp,
    method conv_1x1_bn (line 19) | def conv_1x1_bn(inp, oup, conv_layer=nn.Conv2d, norm_layer=nn.BatchNor...
    method test (line 27) | def test(self, m):

FILE: tests/test_quant_impl.py
  class TestQuantImpl (line 15) | class TestQuantImpl(unittest.TestCase):
    method test (line 43) | def test(self, data):

About this extraction

This page contains the full source code of the aovoc/nnieqat-pytorch GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 36 files (75.0 KB), approximately 20.3k tokens, and a symbol index with 53 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.

Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.
